{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/en-ja.E18-D4/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:30873', 'distributed_port': 30873, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 8192, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 8192, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [8], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/en-ja.E18-D4', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/en-ja.E18-D4/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=8192, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=8192, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[8], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/en-ja.E18-D4', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=False, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/en-ja/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, encoder_layers=18, decoder_layers=4, dropout=0.1, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_learned_pos=False, decoder_embed_path=None, decoder_normalize_before=False, decoder_learned_pos=False, attention_dropout=0.0, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/en-ja/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} TransformerModel( (encoder): TransformerEncoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(16000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-17): 18 x TransformerEncoderLayerBase( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (dropout_module): FairseqDropout() (activation_dropout_module): FairseqDropout() (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) ) (decoder): TransformerDecoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(32000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-3): 4 x TransformerDecoderLayerBase( (dropout_module): FairseqDropout() (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (activation_dropout_module): FairseqDropout() (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (encoder_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (encoder_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) (output_projection): Linear(in_features=1024, out_features=32000, bias=False) ) ) task: TranslationTask model: TransformerModel criterion: LabelSmoothedCrossEntropyCriterion num. shared model params: 527,710,208 (num. trained: 527,710,208) num. expert model params: 0 (num. trained: 0) training on 8 devices (GPUs/TPUs) max tokens per device = 8192 and max sentences per device = None begin dry-run validation on "valid" subset Start iterating over samples epoch 001: 101 / 1707 loss=13.026, nll_loss=12.707, ppl=6687.94, wps=290623, ups=0.68, wpb=429046, bsz=16478.7, num_updates=100, lr=2.5e-05, gnorm=3.127, clip=93, loss_scale=4, train_wall=156, gb_free=59.2, wall=172 epoch 001: 201 / 1707 loss=11.674, nll_loss=11.176, ppl=2314.07, wps=290222, ups=0.68, wpb=428875, bsz=16497.1, num_updates=200, lr=5e-05, gnorm=1.93, clip=93, loss_scale=4, train_wall=147, gb_free=58.8, wall=320 epoch 001: 301 / 1707 loss=11.125, nll_loss=10.523, ppl=1471.8, wps=290922, ups=0.68, wpb=428862, bsz=16456.3, num_updates=300, lr=7.5e-05, gnorm=1.677, clip=94, loss_scale=8, train_wall=147, gb_free=59.7, wall=467 epoch 001: 401 / 1707 loss=10.492, nll_loss=9.769, ppl=872.29, wps=289471, ups=0.68, wpb=427925, bsz=15950.5, num_updates=400, lr=0.0001, gnorm=1.619, clip=100, loss_scale=8, train_wall=147, gb_free=59.1, wall=615 epoch 001: 501 / 1707 loss=10.007, nll_loss=9.185, ppl=582.1, wps=291373, ups=0.68, wpb=430478, bsz=16565.2, num_updates=500, lr=0.000125, gnorm=1.484, clip=95, loss_scale=8, train_wall=147, gb_free=58.9, wall=763 epoch 001: 602 / 1707 loss=9.679, nll_loss=8.79, ppl=442.71, wps=288632, ups=0.67, wpb=429520, bsz=16195.5, num_updates=600, lr=0.00015, gnorm=1.37, clip=93, loss_scale=8, train_wall=148, gb_free=59.7, wall=912 epoch 001: 702 / 1707 loss=9.397, nll_loss=8.455, ppl=350.91, wps=291023, ups=0.68, wpb=428026, bsz=16256.9, num_updates=700, lr=0.000175, gnorm=1.237, clip=87, loss_scale=8, train_wall=146, gb_free=59.3, wall=1059 epoch 001: 802 / 1707 loss=9.139, nll_loss=8.152, ppl=284.52, wps=292982, ups=0.68, wpb=428860, bsz=16393.8, num_updates=800, lr=0.0002, gnorm=1.178, clip=83, loss_scale=16, train_wall=146, gb_free=59.6, wall=1205 epoch 001: 902 / 1707 loss=8.886, nll_loss=7.857, ppl=231.9, wps=292063, ups=0.68, wpb=428861, bsz=16332.8, num_updates=900, lr=0.000225, gnorm=1.078, clip=67, loss_scale=16, train_wall=146, gb_free=59.1, wall=1352 epoch 001: 1002 / 1707 loss=8.655, nll_loss=7.589, ppl=192.57, wps=294173, ups=0.68, wpb=431684, bsz=16342.6, num_updates=1000, lr=0.00025, gnorm=1.029, clip=57, loss_scale=16, train_wall=146, gb_free=59.2, wall=1499 begin validation on "valid" subset epoch 001 | valid on 'valid' subset | loss 8.622 | nll_loss 7.527 | ppl 184.4 | wps 77183.8 | wpb 21331 | bsz 1016 | num_updates 1000 epoch 001: 1103 / 1707 loss=8.434, nll_loss=7.334, ppl=161.37, wps=258873, ups=0.6, wpb=429516, bsz=16321.4, num_updates=1100, lr=0.000275, gnorm=1.011, clip=51, loss_scale=16, train_wall=147, gb_free=58.9, wall=1665 epoch 001: 1203 / 1707 loss=8.254, nll_loss=7.126, ppl=139.7, wps=293024, ups=0.68, wpb=428395, bsz=16167.4, num_updates=1200, lr=0.0003, gnorm=0.966, clip=36, loss_scale=16, train_wall=145, gb_free=59, wall=1811 epoch 001: 1303 / 1707 loss=8.076, nll_loss=6.921, ppl=121.16, wps=294155, ups=0.69, wpb=429296, bsz=16486.5, num_updates=1300, lr=0.000325, gnorm=0.955, clip=36, loss_scale=16, train_wall=145, gb_free=59.1, wall=1957 epoch 001: 1404 / 1707 loss=7.91, nll_loss=6.729, ppl=106.11, wps=291611, ups=0.68, wpb=429574, bsz=16381.2, num_updates=1400, lr=0.00035, gnorm=0.92, clip=25, loss_scale=16, train_wall=146, gb_free=58.9, wall=2104 epoch 001: 1504 / 1707 loss=7.768, nll_loss=6.566, ppl=94.73, wps=294505, ups=0.69, wpb=428758, bsz=16309.5, num_updates=1500, lr=0.000375, gnorm=0.91, clip=25, loss_scale=16, train_wall=145, gb_free=58.9, wall=2250 epoch 001: 1605 / 1707 loss=7.64, nll_loss=6.418, ppl=85.54, wps=291282, ups=0.68, wpb=428698, bsz=16170.8, num_updates=1600, lr=0.0004, gnorm=0.892, clip=21, loss_scale=16, train_wall=146, gb_free=59.4, wall=2397 epoch 001: 1705 / 1707 loss=7.515, nll_loss=6.275, ppl=77.46, wps=294139, ups=0.69, wpb=429336, bsz=16413.8, num_updates=1700, lr=0.000425, gnorm=0.889, clip=23, loss_scale=16, train_wall=145, gb_free=59.7, wall=2543 end of epoch 1 (average epoch stats below) epoch 001 | loss 9.273 | nll_loss 8.326 | ppl 320.9 | wps 289694 | ups 0.68 | wpb 428954 | bsz 16329.9 | num_updates 1702 | lr 0.0004255 | gnorm 1.31 | clip 63.5 | loss_scale 16 | train_wall 2497 | gb_free 60.2 | wall 2545 Start iterating over samples epoch 002: 99 / 1707 loss=7.403, nll_loss=6.146, ppl=70.81, wps=291011, ups=0.68, wpb=425559, bsz=16123.4, num_updates=1800, lr=0.00045, gnorm=0.879, clip=22, loss_scale=8, train_wall=145, gb_free=58.8, wall=2689 epoch 002: 99 / 1707 loss=7.403, nll_loss=6.146, ppl=70.81, wps=291011, ups=0.68, wpb=425559, bsz=16123.4, num_updates=1800, lr=0.00045, gnorm=0.879, clip=22, loss_scale=8, train_wall=145, gb_free=58.8, wall=2689 epoch 002: 199 / 1707 loss=7.318, nll_loss=6.048, ppl=66.15, wps=294720, ups=0.69, wpb=429094, bsz=16306.4, num_updates=1900, lr=0.000475, gnorm=0.89, clip=26, loss_scale=8, train_wall=145, gb_free=59, wall=2835 epoch 002: 199 / 1707 loss=7.318, nll_loss=6.048, ppl=66.15, wps=294720, ups=0.69, wpb=429094, bsz=16306.4, num_updates=1900, lr=0.000475, gnorm=0.89, clip=26, loss_scale=8, train_wall=145, gb_free=59, wall=2835 epoch 002: 299 / 1707 loss=7.226, nll_loss=5.942, ppl=61.49, wps=294024, ups=0.69, wpb=428516, bsz=16700, num_updates=2000, lr=0.0005, gnorm=0.892, clip=26, loss_scale=8, train_wall=145, gb_free=59, wall=2980 epoch 002: 299 / 1707 loss=7.226, nll_loss=5.942, ppl=61.49, wps=294024, ups=0.69, wpb=428516, bsz=16700, num_updates=2000, lr=0.0005, gnorm=0.892, clip=26, loss_scale=8, train_wall=145, gb_free=59, wall=2980 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 7.364 | nll_loss 6.089 | ppl 68.08 | wps 77036.3 | wpb 21331 | bsz 1016 | num_updates 2000 | best_loss 7.364 epoch 002 | valid on 'valid' subset | loss 7.364 | nll_loss 6.089 | ppl 68.08 | wps 77036.3 | wpb 21331 | bsz 1016 | num_updates 2000 | best_loss 7.364 epoch 002: 400 / 1707 loss=7.149, nll_loss=5.854, ppl=57.86, wps=258999, ups=0.6, wpb=429363, bsz=16685, num_updates=2100, lr=0.000525, gnorm=0.86, clip=19, loss_scale=8, train_wall=146, gb_free=59.3, wall=3146 epoch 002: 400 / 1707 loss=7.149, nll_loss=5.854, ppl=57.86, wps=258999, ups=0.6, wpb=429363, bsz=16685, num_updates=2100, lr=0.000525, gnorm=0.86, clip=19, loss_scale=8, train_wall=146, gb_free=59.3, wall=3146 epoch 002: 500 / 1707 loss=7.094, nll_loss=5.792, ppl=55.4, wps=294635, ups=0.69, wpb=428722, bsz=16320, num_updates=2200, lr=0.00055, gnorm=0.885, clip=22, loss_scale=8, train_wall=145, gb_free=59, wall=3292 epoch 002: 500 / 1707 loss=7.094, nll_loss=5.792, ppl=55.4, wps=294635, ups=0.69, wpb=428722, bsz=16320, num_updates=2200, lr=0.00055, gnorm=0.885, clip=22, loss_scale=8, train_wall=145, gb_free=59, wall=3292 epoch 002: 601 / 1707 loss=7.032, nll_loss=5.721, ppl=52.74, wps=291262, ups=0.68, wpb=428929, bsz=16287.3, num_updates=2300, lr=0.000575, gnorm=0.861, clip=10, loss_scale=4, train_wall=146, gb_free=59.7, wall=3439 epoch 002: 601 / 1707 loss=7.032, nll_loss=5.721, ppl=52.74, wps=291262, ups=0.68, wpb=428929, bsz=16287.3, num_updates=2300, lr=0.000575, gnorm=0.861, clip=10, loss_scale=4, train_wall=146, gb_free=59.7, wall=3439 epoch 002: 701 / 1707 loss=6.965, nll_loss=5.645, ppl=50.04, wps=295657, ups=0.69, wpb=430648, bsz=16279, num_updates=2400, lr=0.0006, gnorm=0.85, clip=14, loss_scale=4, train_wall=145, gb_free=59.2, wall=3585 epoch 002: 701 / 1707 loss=6.965, nll_loss=5.645, ppl=50.04, wps=295657, ups=0.69, wpb=430648, bsz=16279, num_updates=2400, lr=0.0006, gnorm=0.85, clip=14, loss_scale=4, train_wall=145, gb_free=59.2, wall=3585 epoch 002: 801 / 1707 loss=6.904, nll_loss=5.576, ppl=47.69, wps=293854, ups=0.69, wpb=428849, bsz=16377.5, num_updates=2500, lr=0.000625, gnorm=0.845, clip=15, loss_scale=8, train_wall=145, gb_free=58.9, wall=3730 epoch 002: 801 / 1707 loss=6.904, nll_loss=5.576, ppl=47.69, wps=293854, ups=0.69, wpb=428849, bsz=16377.5, num_updates=2500, lr=0.000625, gnorm=0.845, clip=15, loss_scale=8, train_wall=145, gb_free=58.9, wall=3730 epoch 002: 901 / 1707 loss=6.88, nll_loss=5.549, ppl=46.83, wps=294341, ups=0.69, wpb=428665, bsz=16227.3, num_updates=2600, lr=0.00065, gnorm=0.835, clip=11, loss_scale=8, train_wall=145, gb_free=58.9, wall=3876 epoch 002: 901 / 1707 loss=6.88, nll_loss=5.549, ppl=46.83, wps=294341, ups=0.69, wpb=428665, bsz=16227.3, num_updates=2600, lr=0.00065, gnorm=0.835, clip=11, loss_scale=8, train_wall=145, gb_free=58.9, wall=3876 epoch 002: 1002 / 1707 loss=6.835, nll_loss=5.5, ppl=45.25, wps=292254, ups=0.68, wpb=428943, bsz=16271.7, num_updates=2700, lr=0.000675, gnorm=0.874, clip=22, loss_scale=4, train_wall=146, gb_free=59.5, wall=4023 epoch 002: 1002 / 1707 loss=6.835, nll_loss=5.5, ppl=45.25, wps=292254, ups=0.68, wpb=428943, bsz=16271.7, num_updates=2700, lr=0.000675, gnorm=0.874, clip=22, loss_scale=4, train_wall=146, gb_free=59.5, wall=4023 epoch 002: 1102 / 1707 loss=6.772, nll_loss=5.429, ppl=43.07, wps=295474, ups=0.69, wpb=429746, bsz=16120.2, num_updates=2800, lr=0.0007, gnorm=0.776, clip=5, loss_scale=4, train_wall=145, gb_free=59.5, wall=4168 epoch 002: 1102 / 1707 loss=6.772, nll_loss=5.429, ppl=43.07, wps=295474, ups=0.69, wpb=429746, bsz=16120.2, num_updates=2800, lr=0.0007, gnorm=0.776, clip=5, loss_scale=4, train_wall=145, gb_free=59.5, wall=4168 epoch 002: 1202 / 1707 loss=6.742, nll_loss=5.396, ppl=42.1, wps=295802, ups=0.69, wpb=430006, bsz=16396.7, num_updates=2900, lr=0.000725, gnorm=0.821, clip=16, loss_scale=8, train_wall=145, gb_free=59.2, wall=4314 epoch 002: 1202 / 1707 loss=6.742, nll_loss=5.396, ppl=42.1, wps=295802, ups=0.69, wpb=430006, bsz=16396.7, num_updates=2900, lr=0.000725, gnorm=0.821, clip=16, loss_scale=8, train_wall=145, gb_free=59.2, wall=4314 epoch 002: 1303 / 1707 loss=6.691, nll_loss=5.338, ppl=40.45, wps=292918, ups=0.68, wpb=429645, bsz=16241.4, num_updates=3000, lr=0.00075, gnorm=0.821, clip=7, loss_scale=4, train_wall=146, gb_free=58.9, wall=4460 epoch 002: 1303 / 1707 loss=6.691, nll_loss=5.338, ppl=40.45, wps=292918, ups=0.68, wpb=429645, bsz=16241.4, num_updates=3000, lr=0.00075, gnorm=0.821, clip=7, loss_scale=4, train_wall=146, gb_free=58.9, wall=4460 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 6.946 | nll_loss 5.637 | ppl 49.77 | wps 76107.2 | wpb 21331 | bsz 1016 | num_updates 3000 | best_loss 6.946 epoch 002 | valid on 'valid' subset | loss 6.946 | nll_loss 5.637 | ppl 49.77 | wps 76107.2 | wpb 21331 | bsz 1016 | num_updates 3000 | best_loss 6.946 epoch 002: 1403 / 1707 loss=6.676, nll_loss=5.323, ppl=40.02, wps=260697, ups=0.61, wpb=427839, bsz=16364.2, num_updates=3100, lr=0.000775, gnorm=0.863, clip=18, loss_scale=4, train_wall=144, gb_free=59.2, wall=4624 epoch 002: 1403 / 1707 loss=6.676, nll_loss=5.323, ppl=40.02, wps=260697, ups=0.61, wpb=427839, bsz=16364.2, num_updates=3100, lr=0.000775, gnorm=0.863, clip=18, loss_scale=4, train_wall=144, gb_free=59.2, wall=4624 epoch 002: 1503 / 1707 loss=6.609, nll_loss=5.247, ppl=37.97, wps=295696, ups=0.69, wpb=428013, bsz=16393.4, num_updates=3200, lr=0.0008, gnorm=0.794, clip=7, loss_scale=8, train_wall=144, gb_free=58.9, wall=4769 epoch 002: 1503 / 1707 loss=6.609, nll_loss=5.247, ppl=37.97, wps=295696, ups=0.69, wpb=428013, bsz=16393.4, num_updates=3200, lr=0.0008, gnorm=0.794, clip=7, loss_scale=8, train_wall=144, gb_free=58.9, wall=4769 epoch 002: 1604 / 1707 loss=6.569, nll_loss=5.201, ppl=36.79, wps=292589, ups=0.68, wpb=429521, bsz=16300, num_updates=3300, lr=0.000825, gnorm=0.808, clip=16, loss_scale=4, train_wall=146, gb_free=59, wall=4916 epoch 002: 1604 / 1707 loss=6.569, nll_loss=5.201, ppl=36.79, wps=292589, ups=0.68, wpb=429521, bsz=16300, num_updates=3300, lr=0.000825, gnorm=0.808, clip=16, loss_scale=4, train_wall=146, gb_free=59, wall=4916 epoch 002: 1704 / 1707 loss=6.539, nll_loss=5.168, ppl=35.95, wps=296070, ups=0.69, wpb=430016, bsz=16104, num_updates=3400, lr=0.00085, gnorm=0.767, clip=3, loss_scale=4, train_wall=144, gb_free=59.3, wall=5061 epoch 002: 1704 / 1707 loss=6.539, nll_loss=5.168, ppl=35.95, wps=296070, ups=0.69, wpb=430016, bsz=16104, num_updates=3400, lr=0.00085, gnorm=0.767, clip=3, loss_scale=4, train_wall=144, gb_free=59.3, wall=5061 end of epoch 2 (average epoch stats below) epoch 002 | loss 6.905 | nll_loss 5.58 | ppl 47.82 | wps 289538 | ups 0.68 | wpb 428940 | bsz 16321.7 | num_updates 3403 | lr 0.00085075 | gnorm 0.843 | clip 15.3 | loss_scale 4 | train_wall 2468 | gb_free 59.4 | wall 5065 epoch 002 | loss 6.905 | nll_loss 5.58 | ppl 47.82 | wps 289538 | ups 0.68 | wpb 428940 | bsz 16321.7 | num_updates 3403 | lr 0.00085075 | gnorm 0.843 | clip 15.3 | loss_scale 4 | train_wall 2468 | gb_free 59.4 | wall 5065 Start iterating over samples epoch 003: 98 / 1707 loss=6.486, nll_loss=5.108, ppl=34.48, wps=291672, ups=0.68, wpb=426182, bsz=16218.2, num_updates=3500, lr=0.000875, gnorm=0.783, clip=12, loss_scale=4, train_wall=145, gb_free=59.4, wall=5207 epoch 003: 98 / 1707 loss=6.486, nll_loss=5.108, ppl=34.48, wps=291672, ups=0.68, wpb=426182, bsz=16218.2, num_updates=3500, lr=0.000875, gnorm=0.783, clip=12, loss_scale=4, train_wall=145, gb_free=59.4, wall=5207 epoch 003: 98 / 1707 loss=6.486, nll_loss=5.108, ppl=34.48, wps=291672, ups=0.68, wpb=426182, bsz=16218.2, num_updates=3500, lr=0.000875, gnorm=0.783, clip=12, loss_scale=4, train_wall=145, gb_free=59.4, wall=5207 epoch 003: 198 / 1707 loss=6.438, nll_loss=5.053, ppl=33.2, wps=295472, ups=0.69, wpb=429484, bsz=16443.6, num_updates=3600, lr=0.0009, gnorm=0.786, clip=8, loss_scale=4, train_wall=145, gb_free=59.2, wall=5353 epoch 003: 198 / 1707 loss=6.438, nll_loss=5.053, ppl=33.2, wps=295472, ups=0.69, wpb=429484, bsz=16443.6, num_updates=3600, lr=0.0009, gnorm=0.786, clip=8, loss_scale=4, train_wall=145, gb_free=59.2, wall=5353 epoch 003: 198 / 1707 loss=6.438, nll_loss=5.053, ppl=33.2, wps=295472, ups=0.69, wpb=429484, bsz=16443.6, num_updates=3600, lr=0.0009, gnorm=0.786, clip=8, loss_scale=4, train_wall=145, gb_free=59.2, wall=5353 epoch 003: 298 / 1707 loss=6.396, nll_loss=5.007, ppl=32.15, wps=294259, ups=0.69, wpb=428790, bsz=16579.1, num_updates=3700, lr=0.000925, gnorm=0.772, clip=4, loss_scale=4, train_wall=145, gb_free=59, wall=5498 epoch 003: 298 / 1707 loss=6.396, nll_loss=5.007, ppl=32.15, wps=294259, ups=0.69, wpb=428790, bsz=16579.1, num_updates=3700, lr=0.000925, gnorm=0.772, clip=4, loss_scale=4, train_wall=145, gb_free=59, wall=5498 epoch 003: 298 / 1707 loss=6.396, nll_loss=5.007, ppl=32.15, wps=294259, ups=0.69, wpb=428790, bsz=16579.1, num_updates=3700, lr=0.000925, gnorm=0.772, clip=4, loss_scale=4, train_wall=145, gb_free=59, wall=5498 epoch 003: 399 / 1707 loss=6.354, nll_loss=4.959, ppl=31.1, wps=291971, ups=0.68, wpb=429517, bsz=16338.2, num_updates=3800, lr=0.00095, gnorm=0.748, clip=3, loss_scale=4, train_wall=146, gb_free=59, wall=5646 epoch 003: 399 / 1707 loss=6.354, nll_loss=4.959, ppl=31.1, wps=291971, ups=0.68, wpb=429517, bsz=16338.2, num_updates=3800, lr=0.00095, gnorm=0.748, clip=3, loss_scale=4, train_wall=146, gb_free=59, wall=5646 epoch 003: 399 / 1707 loss=6.354, nll_loss=4.959, ppl=31.1, wps=291971, ups=0.68, wpb=429517, bsz=16338.2, num_updates=3800, lr=0.00095, gnorm=0.748, clip=3, loss_scale=4, train_wall=146, gb_free=59, wall=5646 epoch 003: 499 / 1707 loss=6.316, nll_loss=4.915, ppl=30.18, wps=294906, ups=0.69, wpb=430412, bsz=16654.7, num_updates=3900, lr=0.000975, gnorm=0.757, clip=4, loss_scale=4, train_wall=145, gb_free=59.2, wall=5792 epoch 003: 499 / 1707 loss=6.316, nll_loss=4.915, ppl=30.18, wps=294906, ups=0.69, wpb=430412, bsz=16654.7, num_updates=3900, lr=0.000975, gnorm=0.757, clip=4, loss_scale=4, train_wall=145, gb_free=59.2, wall=5792 epoch 003: 499 / 1707 loss=6.316, nll_loss=4.915, ppl=30.18, wps=294906, ups=0.69, wpb=430412, bsz=16654.7, num_updates=3900, lr=0.000975, gnorm=0.757, clip=4, loss_scale=4, train_wall=145, gb_free=59.2, wall=5792 epoch 003: 599 / 1707 loss=6.299, nll_loss=4.896, ppl=29.78, wps=295657, ups=0.69, wpb=429457, bsz=16310, num_updates=4000, lr=0.001, gnorm=0.735, clip=5, loss_scale=4, train_wall=144, gb_free=59.4, wall=5937 epoch 003: 599 / 1707 loss=6.299, nll_loss=4.896, ppl=29.78, wps=295657, ups=0.69, wpb=429457, bsz=16310, num_updates=4000, lr=0.001, gnorm=0.735, clip=5, loss_scale=4, train_wall=144, gb_free=59.4, wall=5937 epoch 003: 599 / 1707 loss=6.299, nll_loss=4.896, ppl=29.78, wps=295657, ups=0.69, wpb=429457, bsz=16310, num_updates=4000, lr=0.001, gnorm=0.735, clip=5, loss_scale=4, train_wall=144, gb_free=59.4, wall=5937 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 6.674 | nll_loss 5.333 | ppl 40.3 | wps 76466 | wpb 21331 | bsz 1016 | num_updates 4000 | best_loss 6.674 epoch 003 | valid on 'valid' subset | loss 6.674 | nll_loss 5.333 | ppl 40.3 | wps 76466 | wpb 21331 | bsz 1016 | num_updates 4000 | best_loss 6.674 epoch 003 | valid on 'valid' subset | loss 6.674 | nll_loss 5.333 | ppl 40.3 | wps 76466 | wpb 21331 | bsz 1016 | num_updates 4000 | best_loss 6.674 epoch 003: 700 / 1707 loss=6.246, nll_loss=4.837, ppl=28.58, wps=261753, ups=0.61, wpb=428317, bsz=16244, num_updates=4100, lr=0.00098773, gnorm=0.701, clip=1, loss_scale=4, train_wall=145, gb_free=59.4, wall=6100 epoch 003: 700 / 1707 loss=6.246, nll_loss=4.837, ppl=28.58, wps=261753, ups=0.61, wpb=428317, bsz=16244, num_updates=4100, lr=0.00098773, gnorm=0.701, clip=1, loss_scale=4, train_wall=145, gb_free=59.4, wall=6100 epoch 003: 700 / 1707 loss=6.246, nll_loss=4.837, ppl=28.58, wps=261753, ups=0.61, wpb=428317, bsz=16244, num_updates=4100, lr=0.00098773, gnorm=0.701, clip=1, loss_scale=4, train_wall=145, gb_free=59.4, wall=6100 epoch 003: 800 / 1707 loss=6.213, nll_loss=4.799, ppl=27.84, wps=295381, ups=0.69, wpb=427125, bsz=16008.4, num_updates=4200, lr=0.0009759, gnorm=0.698, clip=3, loss_scale=4, train_wall=144, gb_free=59.5, wall=6245 epoch 003: 800 / 1707 loss=6.213, nll_loss=4.799, ppl=27.84, wps=295381, ups=0.69, wpb=427125, bsz=16008.4, num_updates=4200, lr=0.0009759, gnorm=0.698, clip=3, loss_scale=4, train_wall=144, gb_free=59.5, wall=6245 epoch 003: 800 / 1707 loss=6.213, nll_loss=4.799, ppl=27.84, wps=295381, ups=0.69, wpb=427125, bsz=16008.4, num_updates=4200, lr=0.0009759, gnorm=0.698, clip=3, loss_scale=4, train_wall=144, gb_free=59.5, wall=6245 epoch 003: 900 / 1707 loss=6.156, nll_loss=4.734, ppl=26.62, wps=295787, ups=0.69, wpb=428949, bsz=16275.4, num_updates=4300, lr=0.000964486, gnorm=0.664, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=6390 epoch 003: 900 / 1707 loss=6.156, nll_loss=4.734, ppl=26.62, wps=295787, ups=0.69, wpb=428949, bsz=16275.4, num_updates=4300, lr=0.000964486, gnorm=0.664, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=6390 epoch 003: 900 / 1707 loss=6.156, nll_loss=4.734, ppl=26.62, wps=295787, ups=0.69, wpb=428949, bsz=16275.4, num_updates=4300, lr=0.000964486, gnorm=0.664, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=6390 epoch 003: 1000 / 1707 loss=6.114, nll_loss=4.686, ppl=25.75, wps=296918, ups=0.69, wpb=429451, bsz=16039.2, num_updates=4400, lr=0.000953463, gnorm=0.66, clip=0, loss_scale=8, train_wall=144, gb_free=59.5, wall=6535 epoch 003: 1000 / 1707 loss=6.114, nll_loss=4.686, ppl=25.75, wps=296918, ups=0.69, wpb=429451, bsz=16039.2, num_updates=4400, lr=0.000953463, gnorm=0.66, clip=0, loss_scale=8, train_wall=144, gb_free=59.5, wall=6535 epoch 003: 1000 / 1707 loss=6.114, nll_loss=4.686, ppl=25.75, wps=296918, ups=0.69, wpb=429451, bsz=16039.2, num_updates=4400, lr=0.000953463, gnorm=0.66, clip=0, loss_scale=8, train_wall=144, gb_free=59.5, wall=6535 epoch 003: 1100 / 1707 loss=6.068, nll_loss=4.635, ppl=24.84, wps=295815, ups=0.69, wpb=429805, bsz=16237.3, num_updates=4500, lr=0.000942809, gnorm=0.643, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=6680 epoch 003: 1100 / 1707 loss=6.068, nll_loss=4.635, ppl=24.84, wps=295815, ups=0.69, wpb=429805, bsz=16237.3, num_updates=4500, lr=0.000942809, gnorm=0.643, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=6680 epoch 003: 1100 / 1707 loss=6.068, nll_loss=4.635, ppl=24.84, wps=295815, ups=0.69, wpb=429805, bsz=16237.3, num_updates=4500, lr=0.000942809, gnorm=0.643, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=6680 epoch 003: 1201 / 1707 loss=6.03, nll_loss=4.591, ppl=24.11, wps=292481, ups=0.68, wpb=429041, bsz=16166.8, num_updates=4600, lr=0.000932505, gnorm=0.628, clip=1, loss_scale=4, train_wall=146, gb_free=59.4, wall=6827 epoch 003: 1201 / 1707 loss=6.03, nll_loss=4.591, ppl=24.11, wps=292481, ups=0.68, wpb=429041, bsz=16166.8, num_updates=4600, lr=0.000932505, gnorm=0.628, clip=1, loss_scale=4, train_wall=146, gb_free=59.4, wall=6827 epoch 003: 1201 / 1707 loss=6.03, nll_loss=4.591, ppl=24.11, wps=292481, ups=0.68, wpb=429041, bsz=16166.8, num_updates=4600, lr=0.000932505, gnorm=0.628, clip=1, loss_scale=4, train_wall=146, gb_free=59.4, wall=6827 epoch 003: 1301 / 1707 loss=5.984, nll_loss=4.54, ppl=23.26, wps=295596, ups=0.69, wpb=429678, bsz=16397.8, num_updates=4700, lr=0.000922531, gnorm=0.618, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=6972 epoch 003: 1301 / 1707 loss=5.984, nll_loss=4.54, ppl=23.26, wps=295596, ups=0.69, wpb=429678, bsz=16397.8, num_updates=4700, lr=0.000922531, gnorm=0.618, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=6972 epoch 003: 1301 / 1707 loss=5.984, nll_loss=4.54, ppl=23.26, wps=295596, ups=0.69, wpb=429678, bsz=16397.8, num_updates=4700, lr=0.000922531, gnorm=0.618, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=6972 epoch 003: 1401 / 1707 loss=5.936, nll_loss=4.485, ppl=22.4, wps=295100, ups=0.69, wpb=429264, bsz=16390.2, num_updates=4800, lr=0.000912871, gnorm=0.607, clip=0, loss_scale=8, train_wall=145, gb_free=59, wall=7117 epoch 003: 1401 / 1707 loss=5.936, nll_loss=4.485, ppl=22.4, wps=295100, ups=0.69, wpb=429264, bsz=16390.2, num_updates=4800, lr=0.000912871, gnorm=0.607, clip=0, loss_scale=8, train_wall=145, gb_free=59, wall=7117 epoch 003: 1401 / 1707 loss=5.936, nll_loss=4.485, ppl=22.4, wps=295100, ups=0.69, wpb=429264, bsz=16390.2, num_updates=4800, lr=0.000912871, gnorm=0.607, clip=0, loss_scale=8, train_wall=145, gb_free=59, wall=7117 epoch 003: 1502 / 1707 loss=5.896, nll_loss=4.44, ppl=21.71, wps=292875, ups=0.68, wpb=429815, bsz=16418.7, num_updates=4900, lr=0.000903508, gnorm=0.698, clip=10, loss_scale=4, train_wall=146, gb_free=59.2, wall=7264 epoch 003: 1502 / 1707 loss=5.896, nll_loss=4.44, ppl=21.71, wps=292875, ups=0.68, wpb=429815, bsz=16418.7, num_updates=4900, lr=0.000903508, gnorm=0.698, clip=10, loss_scale=4, train_wall=146, gb_free=59.2, wall=7264 epoch 003: 1502 / 1707 loss=5.896, nll_loss=4.44, ppl=21.71, wps=292875, ups=0.68, wpb=429815, bsz=16418.7, num_updates=4900, lr=0.000903508, gnorm=0.698, clip=10, loss_scale=4, train_wall=146, gb_free=59.2, wall=7264 epoch 003: 1602 / 1707 loss=5.589, nll_loss=4.092, ppl=17.05, wps=294759, ups=0.69, wpb=428163, bsz=16503.8, num_updates=5000, lr=0.000894427, gnorm=0.705, clip=9, loss_scale=4, train_wall=144, gb_free=58.9, wall=7410 epoch 003: 1602 / 1707 loss=5.589, nll_loss=4.092, ppl=17.05, wps=294759, ups=0.69, wpb=428163, bsz=16503.8, num_updates=5000, lr=0.000894427, gnorm=0.705, clip=9, loss_scale=4, train_wall=144, gb_free=58.9, wall=7410 epoch 003: 1602 / 1707 loss=5.589, nll_loss=4.092, ppl=17.05, wps=294759, ups=0.69, wpb=428163, bsz=16503.8, num_updates=5000, lr=0.000894427, gnorm=0.705, clip=9, loss_scale=4, train_wall=144, gb_free=58.9, wall=7410 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 5.394 | nll_loss 3.851 | ppl 14.43 | wps 76760.4 | wpb 21331 | bsz 1016 | num_updates 5000 | best_loss 5.394 epoch 003 | valid on 'valid' subset | loss 5.394 | nll_loss 3.851 | ppl 14.43 | wps 76760.4 | wpb 21331 | bsz 1016 | num_updates 5000 | best_loss 5.394 epoch 003 | valid on 'valid' subset | loss 5.394 | nll_loss 3.851 | ppl 14.43 | wps 76760.4 | wpb 21331 | bsz 1016 | num_updates 5000 | best_loss 5.394 epoch 003: 1702 / 1707 loss=5.252, nll_loss=3.716, ppl=13.14, wps=264306, ups=0.62, wpb=428878, bsz=16383.8, num_updates=5100, lr=0.000885615, gnorm=0.601, clip=4, loss_scale=8, train_wall=144, gb_free=58.8, wall=7572 epoch 003: 1702 / 1707 loss=5.252, nll_loss=3.716, ppl=13.14, wps=264306, ups=0.62, wpb=428878, bsz=16383.8, num_updates=5100, lr=0.000885615, gnorm=0.601, clip=4, loss_scale=8, train_wall=144, gb_free=58.8, wall=7572 epoch 003: 1702 / 1707 loss=5.252, nll_loss=3.716, ppl=13.14, wps=264306, ups=0.62, wpb=428878, bsz=16383.8, num_updates=5100, lr=0.000885615, gnorm=0.601, clip=4, loss_scale=8, train_wall=144, gb_free=58.8, wall=7572 end of epoch 3 (average epoch stats below) epoch 003 | loss 6.101 | nll_loss 4.673 | ppl 25.51 | wps 290490 | ups 0.68 | wpb 428968 | bsz 16329.3 | num_updates 5105 | lr 0.000885181 | gnorm 0.694 | clip 3.6 | loss_scale 8 | train_wall 2464 | gb_free 60.2 | wall 7578 epoch 003 | loss 6.101 | nll_loss 4.673 | ppl 25.51 | wps 290490 | ups 0.68 | wpb 428968 | bsz 16329.3 | num_updates 5105 | lr 0.000885181 | gnorm 0.694 | clip 3.6 | loss_scale 8 | train_wall 2464 | gb_free 60.2 | wall 7578 epoch 003 | loss 6.101 | nll_loss 4.673 | ppl 25.51 | wps 290490 | ups 0.68 | wpb 428968 | bsz 16329.3 | num_updates 5105 | lr 0.000885181 | gnorm 0.694 | clip 3.6 | loss_scale 8 | train_wall 2464 | gb_free 60.2 | wall 7578 Start iterating over samples epoch 004: 95 / 1707 loss=5.067, nll_loss=3.512, ppl=11.41, wps=293006, ups=0.69, wpb=423393, bsz=16411.2, num_updates=5200, lr=0.000877058, gnorm=0.533, clip=3, loss_scale=8, train_wall=144, gb_free=59.3, wall=7716 epoch 004: 95 / 1707 loss=5.067, nll_loss=3.512, ppl=11.41, wps=293006, ups=0.69, wpb=423393, bsz=16411.2, num_updates=5200, lr=0.000877058, gnorm=0.533, clip=3, loss_scale=8, train_wall=144, gb_free=59.3, wall=7716 epoch 004: 95 / 1707 loss=5.067, nll_loss=3.512, ppl=11.41, wps=293006, ups=0.69, wpb=423393, bsz=16411.2, num_updates=5200, lr=0.000877058, gnorm=0.533, clip=3, loss_scale=8, train_wall=144, gb_free=59.3, wall=7716 epoch 004: 95 / 1707 loss=5.067, nll_loss=3.512, ppl=11.41, wps=293006, ups=0.69, wpb=423393, bsz=16411.2, num_updates=5200, lr=0.000877058, gnorm=0.533, clip=3, loss_scale=8, train_wall=144, gb_free=59.3, wall=7716 epoch 004: 196 / 1707 loss=4.987, nll_loss=3.426, ppl=10.75, wps=292349, ups=0.68, wpb=428176, bsz=16172.7, num_updates=5300, lr=0.000868744, gnorm=0.48, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=7863 epoch 004: 196 / 1707 loss=4.987, nll_loss=3.426, ppl=10.75, wps=292349, ups=0.68, wpb=428176, bsz=16172.7, num_updates=5300, lr=0.000868744, gnorm=0.48, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=7863 epoch 004: 196 / 1707 loss=4.987, nll_loss=3.426, ppl=10.75, wps=292349, ups=0.68, wpb=428176, bsz=16172.7, num_updates=5300, lr=0.000868744, gnorm=0.48, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=7863 epoch 004: 196 / 1707 loss=4.987, nll_loss=3.426, ppl=10.75, wps=292349, ups=0.68, wpb=428176, bsz=16172.7, num_updates=5300, lr=0.000868744, gnorm=0.48, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=7863 epoch 004: 296 / 1707 loss=4.929, nll_loss=3.363, ppl=10.29, wps=295271, ups=0.69, wpb=429679, bsz=16046, num_updates=5400, lr=0.000860663, gnorm=0.47, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=8008 epoch 004: 296 / 1707 loss=4.929, nll_loss=3.363, ppl=10.29, wps=295271, ups=0.69, wpb=429679, bsz=16046, num_updates=5400, lr=0.000860663, gnorm=0.47, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=8008 epoch 004: 296 / 1707 loss=4.929, nll_loss=3.363, ppl=10.29, wps=295271, ups=0.69, wpb=429679, bsz=16046, num_updates=5400, lr=0.000860663, gnorm=0.47, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=8008 epoch 004: 296 / 1707 loss=4.929, nll_loss=3.363, ppl=10.29, wps=295271, ups=0.69, wpb=429679, bsz=16046, num_updates=5400, lr=0.000860663, gnorm=0.47, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=8008 epoch 004: 396 / 1707 loss=4.876, nll_loss=3.307, ppl=9.9, wps=295899, ups=0.69, wpb=429848, bsz=16529.9, num_updates=5500, lr=0.000852803, gnorm=0.442, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=8154 epoch 004: 396 / 1707 loss=4.876, nll_loss=3.307, ppl=9.9, wps=295899, ups=0.69, wpb=429848, bsz=16529.9, num_updates=5500, lr=0.000852803, gnorm=0.442, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=8154 epoch 004: 396 / 1707 loss=4.876, nll_loss=3.307, ppl=9.9, wps=295899, ups=0.69, wpb=429848, bsz=16529.9, num_updates=5500, lr=0.000852803, gnorm=0.442, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=8154 epoch 004: 396 / 1707 loss=4.876, nll_loss=3.307, ppl=9.9, wps=295899, ups=0.69, wpb=429848, bsz=16529.9, num_updates=5500, lr=0.000852803, gnorm=0.442, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=8154 epoch 004: 496 / 1707 loss=4.84, nll_loss=3.268, ppl=9.64, wps=295851, ups=0.69, wpb=430173, bsz=16254.7, num_updates=5600, lr=0.000845154, gnorm=0.437, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=8299 epoch 004: 496 / 1707 loss=4.84, nll_loss=3.268, ppl=9.64, wps=295851, ups=0.69, wpb=430173, bsz=16254.7, num_updates=5600, lr=0.000845154, gnorm=0.437, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=8299 epoch 004: 496 / 1707 loss=4.84, nll_loss=3.268, ppl=9.64, wps=295851, ups=0.69, wpb=430173, bsz=16254.7, num_updates=5600, lr=0.000845154, gnorm=0.437, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=8299 epoch 004: 496 / 1707 loss=4.84, nll_loss=3.268, ppl=9.64, wps=295851, ups=0.69, wpb=430173, bsz=16254.7, num_updates=5600, lr=0.000845154, gnorm=0.437, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=8299 epoch 004: 597 / 1707 loss=4.799, nll_loss=3.225, ppl=9.35, wps=292900, ups=0.68, wpb=429272, bsz=16392.4, num_updates=5700, lr=0.000837708, gnorm=0.405, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=8445 epoch 004: 597 / 1707 loss=4.799, nll_loss=3.225, ppl=9.35, wps=292900, ups=0.68, wpb=429272, bsz=16392.4, num_updates=5700, lr=0.000837708, gnorm=0.405, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=8445 epoch 004: 597 / 1707 loss=4.799, nll_loss=3.225, ppl=9.35, wps=292900, ups=0.68, wpb=429272, bsz=16392.4, num_updates=5700, lr=0.000837708, gnorm=0.405, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=8445 epoch 004: 597 / 1707 loss=4.799, nll_loss=3.225, ppl=9.35, wps=292900, ups=0.68, wpb=429272, bsz=16392.4, num_updates=5700, lr=0.000837708, gnorm=0.405, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=8445 epoch 004: 697 / 1707 loss=4.778, nll_loss=3.203, ppl=9.21, wps=294444, ups=0.69, wpb=428361, bsz=16342.6, num_updates=5800, lr=0.000830455, gnorm=0.407, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=8591 epoch 004: 697 / 1707 loss=4.778, nll_loss=3.203, ppl=9.21, wps=294444, ups=0.69, wpb=428361, bsz=16342.6, num_updates=5800, lr=0.000830455, gnorm=0.407, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=8591 epoch 004: 697 / 1707 loss=4.778, nll_loss=3.203, ppl=9.21, wps=294444, ups=0.69, wpb=428361, bsz=16342.6, num_updates=5800, lr=0.000830455, gnorm=0.407, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=8591 epoch 004: 697 / 1707 loss=4.778, nll_loss=3.203, ppl=9.21, wps=294444, ups=0.69, wpb=428361, bsz=16342.6, num_updates=5800, lr=0.000830455, gnorm=0.407, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=8591 epoch 004: 797 / 1707 loss=4.757, nll_loss=3.181, ppl=9.07, wps=294985, ups=0.69, wpb=428579, bsz=16355.4, num_updates=5900, lr=0.000823387, gnorm=0.388, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=8736 epoch 004: 797 / 1707 loss=4.757, nll_loss=3.181, ppl=9.07, wps=294985, ups=0.69, wpb=428579, bsz=16355.4, num_updates=5900, lr=0.000823387, gnorm=0.388, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=8736 epoch 004: 797 / 1707 loss=4.757, nll_loss=3.181, ppl=9.07, wps=294985, ups=0.69, wpb=428579, bsz=16355.4, num_updates=5900, lr=0.000823387, gnorm=0.388, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=8736 epoch 004: 797 / 1707 loss=4.757, nll_loss=3.181, ppl=9.07, wps=294985, ups=0.69, wpb=428579, bsz=16355.4, num_updates=5900, lr=0.000823387, gnorm=0.388, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=8736 epoch 004: 897 / 1707 loss=4.73, nll_loss=3.153, ppl=8.89, wps=295410, ups=0.69, wpb=429704, bsz=16629.3, num_updates=6000, lr=0.000816497, gnorm=0.379, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=8882 epoch 004: 897 / 1707 loss=4.73, nll_loss=3.153, ppl=8.89, wps=295410, ups=0.69, wpb=429704, bsz=16629.3, num_updates=6000, lr=0.000816497, gnorm=0.379, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=8882 epoch 004: 897 / 1707 loss=4.73, nll_loss=3.153, ppl=8.89, wps=295410, ups=0.69, wpb=429704, bsz=16629.3, num_updates=6000, lr=0.000816497, gnorm=0.379, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=8882 epoch 004: 897 / 1707 loss=4.73, nll_loss=3.153, ppl=8.89, wps=295410, ups=0.69, wpb=429704, bsz=16629.3, num_updates=6000, lr=0.000816497, gnorm=0.379, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=8882 begin validation on "valid" subset epoch 004 | valid on 'valid' subset | loss 4.746 | nll_loss 3.152 | ppl 8.89 | wps 76294.9 | wpb 21331 | bsz 1016 | num_updates 6000 | best_loss 4.746 epoch 004 | valid on 'valid' subset | loss 4.746 | nll_loss 3.152 | ppl 8.89 | wps 76294.9 | wpb 21331 | bsz 1016 | num_updates 6000 | best_loss 4.746 epoch 004 | valid on 'valid' subset | loss 4.746 | nll_loss 3.152 | ppl 8.89 | wps 76294.9 | wpb 21331 | bsz 1016 | num_updates 6000 | best_loss 4.746 epoch 004 | valid on 'valid' subset | loss 4.746 | nll_loss 3.152 | ppl 8.89 | wps 76294.9 | wpb 21331 | bsz 1016 | num_updates 6000 | best_loss 4.746 epoch 004: 997 / 1707 loss=4.71, nll_loss=3.131, ppl=8.76, wps=256237, ups=0.6, wpb=429740, bsz=16271.4, num_updates=6100, lr=0.000809776, gnorm=0.381, clip=0, loss_scale=8, train_wall=144, gb_free=59.4, wall=9049 epoch 004: 997 / 1707 loss=4.71, nll_loss=3.131, ppl=8.76, wps=256237, ups=0.6, wpb=429740, bsz=16271.4, num_updates=6100, lr=0.000809776, gnorm=0.381, clip=0, loss_scale=8, train_wall=144, gb_free=59.4, wall=9049 epoch 004: 997 / 1707 loss=4.71, nll_loss=3.131, ppl=8.76, wps=256237, ups=0.6, wpb=429740, bsz=16271.4, num_updates=6100, lr=0.000809776, gnorm=0.381, clip=0, loss_scale=8, train_wall=144, gb_free=59.4, wall=9049 epoch 004: 997 / 1707 loss=4.71, nll_loss=3.131, ppl=8.76, wps=256237, ups=0.6, wpb=429740, bsz=16271.4, num_updates=6100, lr=0.000809776, gnorm=0.381, clip=0, loss_scale=8, train_wall=144, gb_free=59.4, wall=9049 epoch 004: 1098 / 1707 loss=4.683, nll_loss=3.102, ppl=8.59, wps=292228, ups=0.68, wpb=429028, bsz=16482.6, num_updates=6200, lr=0.000803219, gnorm=0.367, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=9196 epoch 004: 1098 / 1707 loss=4.683, nll_loss=3.102, ppl=8.59, wps=292228, ups=0.68, wpb=429028, bsz=16482.6, num_updates=6200, lr=0.000803219, gnorm=0.367, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=9196 epoch 004: 1098 / 1707 loss=4.683, nll_loss=3.102, ppl=8.59, wps=292228, ups=0.68, wpb=429028, bsz=16482.6, num_updates=6200, lr=0.000803219, gnorm=0.367, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=9196 epoch 004: 1098 / 1707 loss=4.683, nll_loss=3.102, ppl=8.59, wps=292228, ups=0.68, wpb=429028, bsz=16482.6, num_updates=6200, lr=0.000803219, gnorm=0.367, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=9196 epoch 004: 1198 / 1707 loss=4.676, nll_loss=3.095, ppl=8.55, wps=297348, ups=0.69, wpb=430423, bsz=16201.1, num_updates=6300, lr=0.000796819, gnorm=0.362, clip=0, loss_scale=4, train_wall=144, gb_free=59.7, wall=9341 epoch 004: 1198 / 1707 loss=4.676, nll_loss=3.095, ppl=8.55, wps=297348, ups=0.69, wpb=430423, bsz=16201.1, num_updates=6300, lr=0.000796819, gnorm=0.362, clip=0, loss_scale=4, train_wall=144, gb_free=59.7, wall=9341 epoch 004: 1198 / 1707 loss=4.676, nll_loss=3.095, ppl=8.55, wps=297348, ups=0.69, wpb=430423, bsz=16201.1, num_updates=6300, lr=0.000796819, gnorm=0.362, clip=0, loss_scale=4, train_wall=144, gb_free=59.7, wall=9341 epoch 004: 1198 / 1707 loss=4.676, nll_loss=3.095, ppl=8.55, wps=297348, ups=0.69, wpb=430423, bsz=16201.1, num_updates=6300, lr=0.000796819, gnorm=0.362, clip=0, loss_scale=4, train_wall=144, gb_free=59.7, wall=9341 epoch 004: 1298 / 1707 loss=4.662, nll_loss=3.08, ppl=8.45, wps=295670, ups=0.69, wpb=428939, bsz=16109, num_updates=6400, lr=0.000790569, gnorm=0.359, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=9486 epoch 004: 1298 / 1707 loss=4.662, nll_loss=3.08, ppl=8.45, wps=295670, ups=0.69, wpb=428939, bsz=16109, num_updates=6400, lr=0.000790569, gnorm=0.359, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=9486 epoch 004: 1298 / 1707 loss=4.662, nll_loss=3.08, ppl=8.45, wps=295670, ups=0.69, wpb=428939, bsz=16109, num_updates=6400, lr=0.000790569, gnorm=0.359, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=9486 epoch 004: 1298 / 1707 loss=4.662, nll_loss=3.08, ppl=8.45, wps=295670, ups=0.69, wpb=428939, bsz=16109, num_updates=6400, lr=0.000790569, gnorm=0.359, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=9486 epoch 004: 1398 / 1707 loss=4.64, nll_loss=3.057, ppl=8.32, wps=295000, ups=0.69, wpb=428457, bsz=16331.1, num_updates=6500, lr=0.000784465, gnorm=0.358, clip=0, loss_scale=8, train_wall=145, gb_free=59.3, wall=9631 epoch 004: 1398 / 1707 loss=4.64, nll_loss=3.057, ppl=8.32, wps=295000, ups=0.69, wpb=428457, bsz=16331.1, num_updates=6500, lr=0.000784465, gnorm=0.358, clip=0, loss_scale=8, train_wall=145, gb_free=59.3, wall=9631 epoch 004: 1398 / 1707 loss=4.64, nll_loss=3.057, ppl=8.32, wps=295000, ups=0.69, wpb=428457, bsz=16331.1, num_updates=6500, lr=0.000784465, gnorm=0.358, clip=0, loss_scale=8, train_wall=145, gb_free=59.3, wall=9631 epoch 004: 1398 / 1707 loss=4.64, nll_loss=3.057, ppl=8.32, wps=295000, ups=0.69, wpb=428457, bsz=16331.1, num_updates=6500, lr=0.000784465, gnorm=0.358, clip=0, loss_scale=8, train_wall=145, gb_free=59.3, wall=9631 epoch 004: 1499 / 1707 loss=4.634, nll_loss=3.051, ppl=8.29, wps=294462, ups=0.69, wpb=429788, bsz=16301.3, num_updates=6600, lr=0.000778499, gnorm=0.348, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=9777 epoch 004: 1499 / 1707 loss=4.634, nll_loss=3.051, ppl=8.29, wps=294462, ups=0.69, wpb=429788, bsz=16301.3, num_updates=6600, lr=0.000778499, gnorm=0.348, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=9777 epoch 004: 1499 / 1707 loss=4.634, nll_loss=3.051, ppl=8.29, wps=294462, ups=0.69, wpb=429788, bsz=16301.3, num_updates=6600, lr=0.000778499, gnorm=0.348, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=9777 epoch 004: 1499 / 1707 loss=4.634, nll_loss=3.051, ppl=8.29, wps=294462, ups=0.69, wpb=429788, bsz=16301.3, num_updates=6600, lr=0.000778499, gnorm=0.348, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=9777 epoch 004: 1599 / 1707 loss=4.619, nll_loss=3.035, ppl=8.2, wps=295873, ups=0.69, wpb=429051, bsz=16577.5, num_updates=6700, lr=0.000772667, gnorm=0.346, clip=0, loss_scale=4, train_wall=144, gb_free=58.5, wall=9922 epoch 004: 1599 / 1707 loss=4.619, nll_loss=3.035, ppl=8.2, wps=295873, ups=0.69, wpb=429051, bsz=16577.5, num_updates=6700, lr=0.000772667, gnorm=0.346, clip=0, loss_scale=4, train_wall=144, gb_free=58.5, wall=9922 epoch 004: 1599 / 1707 loss=4.619, nll_loss=3.035, ppl=8.2, wps=295873, ups=0.69, wpb=429051, bsz=16577.5, num_updates=6700, lr=0.000772667, gnorm=0.346, clip=0, loss_scale=4, train_wall=144, gb_free=58.5, wall=9922 epoch 004: 1599 / 1707 loss=4.619, nll_loss=3.035, ppl=8.2, wps=295873, ups=0.69, wpb=429051, bsz=16577.5, num_updates=6700, lr=0.000772667, gnorm=0.346, clip=0, loss_scale=4, train_wall=144, gb_free=58.5, wall=9922 epoch 004: 1699 / 1707 loss=4.608, nll_loss=3.023, ppl=8.13, wps=295546, ups=0.69, wpb=429614, bsz=16161.6, num_updates=6800, lr=0.000766965, gnorm=0.336, clip=0, loss_scale=8, train_wall=145, gb_free=59.1, wall=10068 epoch 004: 1699 / 1707 loss=4.608, nll_loss=3.023, ppl=8.13, wps=295546, ups=0.69, wpb=429614, bsz=16161.6, num_updates=6800, lr=0.000766965, gnorm=0.336, clip=0, loss_scale=8, train_wall=145, gb_free=59.1, wall=10068 epoch 004: 1699 / 1707 loss=4.608, nll_loss=3.023, ppl=8.13, wps=295546, ups=0.69, wpb=429614, bsz=16161.6, num_updates=6800, lr=0.000766965, gnorm=0.336, clip=0, loss_scale=8, train_wall=145, gb_free=59.1, wall=10068 epoch 004: 1699 / 1707 loss=4.608, nll_loss=3.023, ppl=8.13, wps=295546, ups=0.69, wpb=429614, bsz=16161.6, num_updates=6800, lr=0.000766965, gnorm=0.336, clip=0, loss_scale=8, train_wall=145, gb_free=59.1, wall=10068 end of epoch 4 (average epoch stats below) epoch 004 | loss 4.762 | nll_loss 3.187 | ppl 9.1 | wps 292173 | ups 0.68 | wpb 428949 | bsz 16326.5 | num_updates 6808 | lr 0.000766514 | gnorm 0.399 | clip 0.2 | loss_scale 8 | train_wall 2464 | gb_free 59.9 | wall 10078 epoch 004 | loss 4.762 | nll_loss 3.187 | ppl 9.1 | wps 292173 | ups 0.68 | wpb 428949 | bsz 16326.5 | num_updates 6808 | lr 0.000766514 | gnorm 0.399 | clip 0.2 | loss_scale 8 | train_wall 2464 | gb_free 59.9 | wall 10078 epoch 004 | loss 4.762 | nll_loss 3.187 | ppl 9.1 | wps 292173 | ups 0.68 | wpb 428949 | bsz 16326.5 | num_updates 6808 | lr 0.000766514 | gnorm 0.399 | clip 0.2 | loss_scale 8 | train_wall 2464 | gb_free 59.9 | wall 10078 epoch 004 | loss 4.762 | nll_loss 3.187 | ppl 9.1 | wps 292173 | ups 0.68 | wpb 428949 | bsz 16326.5 | num_updates 6808 | lr 0.000766514 | gnorm 0.399 | clip 0.2 | loss_scale 8 | train_wall 2464 | gb_free 59.9 | wall 10078 Start iterating over samples epoch 005: 93 / 1707 loss=4.557, nll_loss=2.964, ppl=7.8, wps=291284, ups=0.69, wpb=424516, bsz=16161.6, num_updates=6900, lr=0.000761387, gnorm=0.348, clip=0, loss_scale=4, train_wall=145, gb_free=58.7, wall=10213 epoch 005: 93 / 1707 loss=4.557, nll_loss=2.964, ppl=7.8, wps=291284, ups=0.69, wpb=424516, bsz=16161.6, num_updates=6900, lr=0.000761387, gnorm=0.348, clip=0, loss_scale=4, train_wall=145, gb_free=58.7, wall=10213 epoch 005: 93 / 1707 loss=4.557, nll_loss=2.964, ppl=7.8, wps=291284, ups=0.69, wpb=424516, bsz=16161.6, num_updates=6900, lr=0.000761387, gnorm=0.348, clip=0, loss_scale=4, train_wall=145, gb_free=58.7, wall=10213 epoch 005: 93 / 1707 loss=4.557, nll_loss=2.964, ppl=7.8, wps=291284, ups=0.69, wpb=424516, bsz=16161.6, num_updates=6900, lr=0.000761387, gnorm=0.348, clip=0, loss_scale=4, train_wall=145, gb_free=58.7, wall=10213 epoch 005: 93 / 1707 loss=4.557, nll_loss=2.964, ppl=7.8, wps=291284, ups=0.69, wpb=424516, bsz=16161.6, num_updates=6900, lr=0.000761387, gnorm=0.348, clip=0, loss_scale=4, train_wall=145, gb_free=58.7, wall=10213 epoch 005: 193 / 1707 loss=4.56, nll_loss=2.969, ppl=7.83, wps=296651, ups=0.69, wpb=430154, bsz=16285.7, num_updates=7000, lr=0.000755929, gnorm=0.326, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=10358 epoch 005: 193 / 1707 loss=4.56, nll_loss=2.969, ppl=7.83, wps=296651, ups=0.69, wpb=430154, bsz=16285.7, num_updates=7000, lr=0.000755929, gnorm=0.326, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=10358 epoch 005: 193 / 1707 loss=4.56, nll_loss=2.969, ppl=7.83, wps=296651, ups=0.69, wpb=430154, bsz=16285.7, num_updates=7000, lr=0.000755929, gnorm=0.326, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=10358 epoch 005: 193 / 1707 loss=4.56, nll_loss=2.969, ppl=7.83, wps=296651, ups=0.69, wpb=430154, bsz=16285.7, num_updates=7000, lr=0.000755929, gnorm=0.326, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=10358 epoch 005: 193 / 1707 loss=4.56, nll_loss=2.969, ppl=7.83, wps=296651, ups=0.69, wpb=430154, bsz=16285.7, num_updates=7000, lr=0.000755929, gnorm=0.326, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=10358 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.607 | nll_loss 3.006 | ppl 8.03 | wps 76626.9 | wpb 21331 | bsz 1016 | num_updates 7000 | best_loss 4.607 epoch 005 | valid on 'valid' subset | loss 4.607 | nll_loss 3.006 | ppl 8.03 | wps 76626.9 | wpb 21331 | bsz 1016 | num_updates 7000 | best_loss 4.607 epoch 005 | valid on 'valid' subset | loss 4.607 | nll_loss 3.006 | ppl 8.03 | wps 76626.9 | wpb 21331 | bsz 1016 | num_updates 7000 | best_loss 4.607 epoch 005 | valid on 'valid' subset | loss 4.607 | nll_loss 3.006 | ppl 8.03 | wps 76626.9 | wpb 21331 | bsz 1016 | num_updates 7000 | best_loss 4.607 epoch 005 | valid on 'valid' subset | loss 4.607 | nll_loss 3.006 | ppl 8.03 | wps 76626.9 | wpb 21331 | bsz 1016 | num_updates 7000 | best_loss 4.607 epoch 005: 293 / 1707 loss=4.553, nll_loss=2.961, ppl=7.79, wps=264672, ups=0.62, wpb=429745, bsz=16433.4, num_updates=7100, lr=0.000750587, gnorm=0.326, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=10521 epoch 005: 293 / 1707 loss=4.553, nll_loss=2.961, ppl=7.79, wps=264672, ups=0.62, wpb=429745, bsz=16433.4, num_updates=7100, lr=0.000750587, gnorm=0.326, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=10521 epoch 005: 293 / 1707 loss=4.553, nll_loss=2.961, ppl=7.79, wps=264672, ups=0.62, wpb=429745, bsz=16433.4, num_updates=7100, lr=0.000750587, gnorm=0.326, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=10521 epoch 005: 293 / 1707 loss=4.553, nll_loss=2.961, ppl=7.79, wps=264672, ups=0.62, wpb=429745, bsz=16433.4, num_updates=7100, lr=0.000750587, gnorm=0.326, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=10521 epoch 005: 293 / 1707 loss=4.553, nll_loss=2.961, ppl=7.79, wps=264672, ups=0.62, wpb=429745, bsz=16433.4, num_updates=7100, lr=0.000750587, gnorm=0.326, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=10521 epoch 005: 393 / 1707 loss=4.542, nll_loss=2.95, ppl=7.73, wps=297101, ups=0.69, wpb=429464, bsz=16285.7, num_updates=7200, lr=0.000745356, gnorm=0.326, clip=0, loss_scale=8, train_wall=144, gb_free=59.5, wall=10665 epoch 005: 393 / 1707 loss=4.542, nll_loss=2.95, ppl=7.73, wps=297101, ups=0.69, wpb=429464, bsz=16285.7, num_updates=7200, lr=0.000745356, gnorm=0.326, clip=0, loss_scale=8, train_wall=144, gb_free=59.5, wall=10665 epoch 005: 393 / 1707 loss=4.542, nll_loss=2.95, ppl=7.73, wps=297101, ups=0.69, wpb=429464, bsz=16285.7, num_updates=7200, lr=0.000745356, gnorm=0.326, clip=0, loss_scale=8, train_wall=144, gb_free=59.5, wall=10665 epoch 005: 393 / 1707 loss=4.542, nll_loss=2.95, ppl=7.73, wps=297101, ups=0.69, wpb=429464, bsz=16285.7, num_updates=7200, lr=0.000745356, gnorm=0.326, clip=0, loss_scale=8, train_wall=144, gb_free=59.5, wall=10665 epoch 005: 393 / 1707 loss=4.542, nll_loss=2.95, ppl=7.73, wps=297101, ups=0.69, wpb=429464, bsz=16285.7, num_updates=7200, lr=0.000745356, gnorm=0.326, clip=0, loss_scale=8, train_wall=144, gb_free=59.5, wall=10665 epoch 005: 494 / 1707 loss=4.535, nll_loss=2.942, ppl=7.69, wps=291867, ups=0.68, wpb=427967, bsz=16396, num_updates=7300, lr=0.000740233, gnorm=0.327, clip=0, loss_scale=4, train_wall=146, gb_free=59.3, wall=10812 epoch 005: 494 / 1707 loss=4.535, nll_loss=2.942, ppl=7.69, wps=291867, ups=0.68, wpb=427967, bsz=16396, num_updates=7300, lr=0.000740233, gnorm=0.327, clip=0, loss_scale=4, train_wall=146, gb_free=59.3, wall=10812 epoch 005: 494 / 1707 loss=4.535, nll_loss=2.942, ppl=7.69, wps=291867, ups=0.68, wpb=427967, bsz=16396, num_updates=7300, lr=0.000740233, gnorm=0.327, clip=0, loss_scale=4, train_wall=146, gb_free=59.3, wall=10812 epoch 005: 494 / 1707 loss=4.535, nll_loss=2.942, ppl=7.69, wps=291867, ups=0.68, wpb=427967, bsz=16396, num_updates=7300, lr=0.000740233, gnorm=0.327, clip=0, loss_scale=4, train_wall=146, gb_free=59.3, wall=10812 epoch 005: 494 / 1707 loss=4.535, nll_loss=2.942, ppl=7.69, wps=291867, ups=0.68, wpb=427967, bsz=16396, num_updates=7300, lr=0.000740233, gnorm=0.327, clip=0, loss_scale=4, train_wall=146, gb_free=59.3, wall=10812 epoch 005: 594 / 1707 loss=4.533, nll_loss=2.94, ppl=7.68, wps=295528, ups=0.69, wpb=428157, bsz=16141.4, num_updates=7400, lr=0.000735215, gnorm=0.317, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=10957 epoch 005: 594 / 1707 loss=4.533, nll_loss=2.94, ppl=7.68, wps=295528, ups=0.69, wpb=428157, bsz=16141.4, num_updates=7400, lr=0.000735215, gnorm=0.317, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=10957 epoch 005: 594 / 1707 loss=4.533, nll_loss=2.94, ppl=7.68, wps=295528, ups=0.69, wpb=428157, bsz=16141.4, num_updates=7400, lr=0.000735215, gnorm=0.317, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=10957 epoch 005: 594 / 1707 loss=4.533, nll_loss=2.94, ppl=7.68, wps=295528, ups=0.69, wpb=428157, bsz=16141.4, num_updates=7400, lr=0.000735215, gnorm=0.317, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=10957 epoch 005: 594 / 1707 loss=4.533, nll_loss=2.94, ppl=7.68, wps=295528, ups=0.69, wpb=428157, bsz=16141.4, num_updates=7400, lr=0.000735215, gnorm=0.317, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=10957 epoch 005: 694 / 1707 loss=4.521, nll_loss=2.928, ppl=7.61, wps=295943, ups=0.69, wpb=429792, bsz=16296, num_updates=7500, lr=0.000730297, gnorm=0.321, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=11102 epoch 005: 694 / 1707 loss=4.521, nll_loss=2.928, ppl=7.61, wps=295943, ups=0.69, wpb=429792, bsz=16296, num_updates=7500, lr=0.000730297, gnorm=0.321, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=11102 epoch 005: 694 / 1707 loss=4.521, nll_loss=2.928, ppl=7.61, wps=295943, ups=0.69, wpb=429792, bsz=16296, num_updates=7500, lr=0.000730297, gnorm=0.321, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=11102 epoch 005: 694 / 1707 loss=4.521, nll_loss=2.928, ppl=7.61, wps=295943, ups=0.69, wpb=429792, bsz=16296, num_updates=7500, lr=0.000730297, gnorm=0.321, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=11102 epoch 005: 694 / 1707 loss=4.521, nll_loss=2.928, ppl=7.61, wps=295943, ups=0.69, wpb=429792, bsz=16296, num_updates=7500, lr=0.000730297, gnorm=0.321, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=11102 epoch 005: 794 / 1707 loss=4.53, nll_loss=2.938, ppl=7.67, wps=296351, ups=0.69, wpb=429805, bsz=16459.2, num_updates=7600, lr=0.000725476, gnorm=0.322, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=11247 epoch 005: 794 / 1707 loss=4.53, nll_loss=2.938, ppl=7.67, wps=296351, ups=0.69, wpb=429805, bsz=16459.2, num_updates=7600, lr=0.000725476, gnorm=0.322, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=11247 epoch 005: 794 / 1707 loss=4.53, nll_loss=2.938, ppl=7.67, wps=296351, ups=0.69, wpb=429805, bsz=16459.2, num_updates=7600, lr=0.000725476, gnorm=0.322, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=11247 epoch 005: 794 / 1707 loss=4.53, nll_loss=2.938, ppl=7.67, wps=296351, ups=0.69, wpb=429805, bsz=16459.2, num_updates=7600, lr=0.000725476, gnorm=0.322, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=11247 epoch 005: 794 / 1707 loss=4.53, nll_loss=2.938, ppl=7.67, wps=296351, ups=0.69, wpb=429805, bsz=16459.2, num_updates=7600, lr=0.000725476, gnorm=0.322, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=11247 epoch 005: 895 / 1707 loss=4.512, nll_loss=2.92, ppl=7.57, wps=293614, ups=0.68, wpb=430116, bsz=16462, num_updates=7700, lr=0.00072075, gnorm=0.309, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=11394 epoch 005: 895 / 1707 loss=4.512, nll_loss=2.92, ppl=7.57, wps=293614, ups=0.68, wpb=430116, bsz=16462, num_updates=7700, lr=0.00072075, gnorm=0.309, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=11394 epoch 005: 895 / 1707 loss=4.512, nll_loss=2.92, ppl=7.57, wps=293614, ups=0.68, wpb=430116, bsz=16462, num_updates=7700, lr=0.00072075, gnorm=0.309, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=11394 epoch 005: 895 / 1707 loss=4.512, nll_loss=2.92, ppl=7.57, wps=293614, ups=0.68, wpb=430116, bsz=16462, num_updates=7700, lr=0.00072075, gnorm=0.309, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=11394 epoch 005: 895 / 1707 loss=4.512, nll_loss=2.92, ppl=7.57, wps=293614, ups=0.68, wpb=430116, bsz=16462, num_updates=7700, lr=0.00072075, gnorm=0.309, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=11394 epoch 005: 995 / 1707 loss=4.504, nll_loss=2.91, ppl=7.52, wps=294157, ups=0.69, wpb=426350, bsz=16189.1, num_updates=7800, lr=0.000716115, gnorm=0.315, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=11539 epoch 005: 995 / 1707 loss=4.504, nll_loss=2.91, ppl=7.52, wps=294157, ups=0.69, wpb=426350, bsz=16189.1, num_updates=7800, lr=0.000716115, gnorm=0.315, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=11539 epoch 005: 995 / 1707 loss=4.504, nll_loss=2.91, ppl=7.52, wps=294157, ups=0.69, wpb=426350, bsz=16189.1, num_updates=7800, lr=0.000716115, gnorm=0.315, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=11539 epoch 005: 995 / 1707 loss=4.504, nll_loss=2.91, ppl=7.52, wps=294157, ups=0.69, wpb=426350, bsz=16189.1, num_updates=7800, lr=0.000716115, gnorm=0.315, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=11539 epoch 005: 995 / 1707 loss=4.504, nll_loss=2.91, ppl=7.52, wps=294157, ups=0.69, wpb=426350, bsz=16189.1, num_updates=7800, lr=0.000716115, gnorm=0.315, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=11539 epoch 005: 1096 / 1707 loss=4.508, nll_loss=2.915, ppl=7.54, wps=293776, ups=0.68, wpb=429807, bsz=15985.5, num_updates=7900, lr=0.000711568, gnorm=0.308, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=11685 epoch 005: 1096 / 1707 loss=4.508, nll_loss=2.915, ppl=7.54, wps=293776, ups=0.68, wpb=429807, bsz=15985.5, num_updates=7900, lr=0.000711568, gnorm=0.308, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=11685 epoch 005: 1096 / 1707 loss=4.508, nll_loss=2.915, ppl=7.54, wps=293776, ups=0.68, wpb=429807, bsz=15985.5, num_updates=7900, lr=0.000711568, gnorm=0.308, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=11685 epoch 005: 1096 / 1707 loss=4.508, nll_loss=2.915, ppl=7.54, wps=293776, ups=0.68, wpb=429807, bsz=15985.5, num_updates=7900, lr=0.000711568, gnorm=0.308, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=11685 epoch 005: 1096 / 1707 loss=4.508, nll_loss=2.915, ppl=7.54, wps=293776, ups=0.68, wpb=429807, bsz=15985.5, num_updates=7900, lr=0.000711568, gnorm=0.308, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=11685 epoch 005: 1196 / 1707 loss=4.495, nll_loss=2.902, ppl=7.47, wps=296128, ups=0.69, wpb=430777, bsz=16372.7, num_updates=8000, lr=0.000707107, gnorm=0.307, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=11830 epoch 005: 1196 / 1707 loss=4.495, nll_loss=2.902, ppl=7.47, wps=296128, ups=0.69, wpb=430777, bsz=16372.7, num_updates=8000, lr=0.000707107, gnorm=0.307, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=11830 epoch 005: 1196 / 1707 loss=4.495, nll_loss=2.902, ppl=7.47, wps=296128, ups=0.69, wpb=430777, bsz=16372.7, num_updates=8000, lr=0.000707107, gnorm=0.307, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=11830 epoch 005: 1196 / 1707 loss=4.495, nll_loss=2.902, ppl=7.47, wps=296128, ups=0.69, wpb=430777, bsz=16372.7, num_updates=8000, lr=0.000707107, gnorm=0.307, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=11830 epoch 005: 1196 / 1707 loss=4.495, nll_loss=2.902, ppl=7.47, wps=296128, ups=0.69, wpb=430777, bsz=16372.7, num_updates=8000, lr=0.000707107, gnorm=0.307, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=11830 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.545 | nll_loss 2.939 | ppl 7.67 | wps 77024.8 | wpb 21331 | bsz 1016 | num_updates 8000 | best_loss 4.545 epoch 005 | valid on 'valid' subset | loss 4.545 | nll_loss 2.939 | ppl 7.67 | wps 77024.8 | wpb 21331 | bsz 1016 | num_updates 8000 | best_loss 4.545 epoch 005 | valid on 'valid' subset | loss 4.545 | nll_loss 2.939 | ppl 7.67 | wps 77024.8 | wpb 21331 | bsz 1016 | num_updates 8000 | best_loss 4.545 epoch 005 | valid on 'valid' subset | loss 4.545 | nll_loss 2.939 | ppl 7.67 | wps 77024.8 | wpb 21331 | bsz 1016 | num_updates 8000 | best_loss 4.545 epoch 005 | valid on 'valid' subset | loss 4.545 | nll_loss 2.939 | ppl 7.67 | wps 77024.8 | wpb 21331 | bsz 1016 | num_updates 8000 | best_loss 4.545 epoch 005: 1296 / 1707 loss=4.491, nll_loss=2.897, ppl=7.45, wps=258981, ups=0.6, wpb=429027, bsz=16427.4, num_updates=8100, lr=0.000702728, gnorm=0.315, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=11996 epoch 005: 1296 / 1707 loss=4.491, nll_loss=2.897, ppl=7.45, wps=258981, ups=0.6, wpb=429027, bsz=16427.4, num_updates=8100, lr=0.000702728, gnorm=0.315, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=11996 epoch 005: 1296 / 1707 loss=4.491, nll_loss=2.897, ppl=7.45, wps=258981, ups=0.6, wpb=429027, bsz=16427.4, num_updates=8100, lr=0.000702728, gnorm=0.315, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=11996 epoch 005: 1296 / 1707 loss=4.491, nll_loss=2.897, ppl=7.45, wps=258981, ups=0.6, wpb=429027, bsz=16427.4, num_updates=8100, lr=0.000702728, gnorm=0.315, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=11996 epoch 005: 1296 / 1707 loss=4.491, nll_loss=2.897, ppl=7.45, wps=258981, ups=0.6, wpb=429027, bsz=16427.4, num_updates=8100, lr=0.000702728, gnorm=0.315, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=11996 epoch 005: 1396 / 1707 loss=4.484, nll_loss=2.889, ppl=7.41, wps=295074, ups=0.69, wpb=428097, bsz=16599.1, num_updates=8200, lr=0.00069843, gnorm=0.309, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=12141 epoch 005: 1396 / 1707 loss=4.484, nll_loss=2.889, ppl=7.41, wps=295074, ups=0.69, wpb=428097, bsz=16599.1, num_updates=8200, lr=0.00069843, gnorm=0.309, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=12141 epoch 005: 1396 / 1707 loss=4.484, nll_loss=2.889, ppl=7.41, wps=295074, ups=0.69, wpb=428097, bsz=16599.1, num_updates=8200, lr=0.00069843, gnorm=0.309, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=12141 epoch 005: 1396 / 1707 loss=4.484, nll_loss=2.889, ppl=7.41, wps=295074, ups=0.69, wpb=428097, bsz=16599.1, num_updates=8200, lr=0.00069843, gnorm=0.309, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=12141 epoch 005: 1396 / 1707 loss=4.484, nll_loss=2.889, ppl=7.41, wps=295074, ups=0.69, wpb=428097, bsz=16599.1, num_updates=8200, lr=0.00069843, gnorm=0.309, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=12141 epoch 005: 1496 / 1707 loss=4.471, nll_loss=2.876, ppl=7.34, wps=295235, ups=0.69, wpb=429398, bsz=16558.7, num_updates=8300, lr=0.00069421, gnorm=0.296, clip=0, loss_scale=8, train_wall=145, gb_free=58.7, wall=12287 epoch 005: 1496 / 1707 loss=4.471, nll_loss=2.876, ppl=7.34, wps=295235, ups=0.69, wpb=429398, bsz=16558.7, num_updates=8300, lr=0.00069421, gnorm=0.296, clip=0, loss_scale=8, train_wall=145, gb_free=58.7, wall=12287 epoch 005: 1496 / 1707 loss=4.471, nll_loss=2.876, ppl=7.34, wps=295235, ups=0.69, wpb=429398, bsz=16558.7, num_updates=8300, lr=0.00069421, gnorm=0.296, clip=0, loss_scale=8, train_wall=145, gb_free=58.7, wall=12287 epoch 005: 1496 / 1707 loss=4.471, nll_loss=2.876, ppl=7.34, wps=295235, ups=0.69, wpb=429398, bsz=16558.7, num_updates=8300, lr=0.00069421, gnorm=0.296, clip=0, loss_scale=8, train_wall=145, gb_free=58.7, wall=12287 epoch 005: 1496 / 1707 loss=4.471, nll_loss=2.876, ppl=7.34, wps=295235, ups=0.69, wpb=429398, bsz=16558.7, num_updates=8300, lr=0.00069421, gnorm=0.296, clip=0, loss_scale=8, train_wall=145, gb_free=58.7, wall=12287 epoch 005: 1597 / 1707 loss=4.476, nll_loss=2.882, ppl=7.37, wps=292273, ups=0.68, wpb=429028, bsz=16289.7, num_updates=8400, lr=0.000690066, gnorm=0.292, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=12433 epoch 005: 1597 / 1707 loss=4.476, nll_loss=2.882, ppl=7.37, wps=292273, ups=0.68, wpb=429028, bsz=16289.7, num_updates=8400, lr=0.000690066, gnorm=0.292, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=12433 epoch 005: 1597 / 1707 loss=4.476, nll_loss=2.882, ppl=7.37, wps=292273, ups=0.68, wpb=429028, bsz=16289.7, num_updates=8400, lr=0.000690066, gnorm=0.292, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=12433 epoch 005: 1597 / 1707 loss=4.476, nll_loss=2.882, ppl=7.37, wps=292273, ups=0.68, wpb=429028, bsz=16289.7, num_updates=8400, lr=0.000690066, gnorm=0.292, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=12433 epoch 005: 1597 / 1707 loss=4.476, nll_loss=2.882, ppl=7.37, wps=292273, ups=0.68, wpb=429028, bsz=16289.7, num_updates=8400, lr=0.000690066, gnorm=0.292, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=12433 epoch 005: 1697 / 1707 loss=4.471, nll_loss=2.876, ppl=7.34, wps=296087, ups=0.69, wpb=429593, bsz=16175.8, num_updates=8500, lr=0.000685994, gnorm=0.31, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=12578 epoch 005: 1697 / 1707 loss=4.471, nll_loss=2.876, ppl=7.34, wps=296087, ups=0.69, wpb=429593, bsz=16175.8, num_updates=8500, lr=0.000685994, gnorm=0.31, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=12578 epoch 005: 1697 / 1707 loss=4.471, nll_loss=2.876, ppl=7.34, wps=296087, ups=0.69, wpb=429593, bsz=16175.8, num_updates=8500, lr=0.000685994, gnorm=0.31, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=12578 epoch 005: 1697 / 1707 loss=4.471, nll_loss=2.876, ppl=7.34, wps=296087, ups=0.69, wpb=429593, bsz=16175.8, num_updates=8500, lr=0.000685994, gnorm=0.31, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=12578 epoch 005: 1697 / 1707 loss=4.471, nll_loss=2.876, ppl=7.34, wps=296087, ups=0.69, wpb=429593, bsz=16175.8, num_updates=8500, lr=0.000685994, gnorm=0.31, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=12578 end of epoch 5 (average epoch stats below) epoch 005 | loss 4.514 | nll_loss 2.921 | ppl 7.57 | wps 290408 | ups 0.68 | wpb 428942 | bsz 16328.9 | num_updates 8510 | lr 0.000685591 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 2461 | gb_free 60.2 | wall 12592 epoch 005 | loss 4.514 | nll_loss 2.921 | ppl 7.57 | wps 290408 | ups 0.68 | wpb 428942 | bsz 16328.9 | num_updates 8510 | lr 0.000685591 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 2461 | gb_free 60.2 | wall 12592 epoch 005 | loss 4.514 | nll_loss 2.921 | ppl 7.57 | wps 290408 | ups 0.68 | wpb 428942 | bsz 16328.9 | num_updates 8510 | lr 0.000685591 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 2461 | gb_free 60.2 | wall 12592 epoch 005 | loss 4.514 | nll_loss 2.921 | ppl 7.57 | wps 290408 | ups 0.68 | wpb 428942 | bsz 16328.9 | num_updates 8510 | lr 0.000685591 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 2461 | gb_free 60.2 | wall 12592 epoch 005 | loss 4.514 | nll_loss 2.921 | ppl 7.57 | wps 290408 | ups 0.68 | wpb 428942 | bsz 16328.9 | num_updates 8510 | lr 0.000685591 | gnorm 0.316 | clip 0 | loss_scale 4 | train_wall 2461 | gb_free 60.2 | wall 12592 Start iterating over samples epoch 006: 90 / 1707 loss=4.425, nll_loss=2.823, ppl=7.08, wps=295615, ups=0.69, wpb=426800, bsz=16476.7, num_updates=8600, lr=0.000681994, gnorm=0.29, clip=0, loss_scale=8, train_wall=143, gb_free=59.1, wall=12723 epoch 006: 90 / 1707 loss=4.425, nll_loss=2.823, ppl=7.08, wps=295615, ups=0.69, wpb=426800, bsz=16476.7, num_updates=8600, lr=0.000681994, gnorm=0.29, clip=0, loss_scale=8, train_wall=143, gb_free=59.1, wall=12723 epoch 006: 90 / 1707 loss=4.425, nll_loss=2.823, ppl=7.08, wps=295615, ups=0.69, wpb=426800, bsz=16476.7, num_updates=8600, lr=0.000681994, gnorm=0.29, clip=0, loss_scale=8, train_wall=143, gb_free=59.1, wall=12723 epoch 006: 90 / 1707 loss=4.425, nll_loss=2.823, ppl=7.08, wps=295615, ups=0.69, wpb=426800, bsz=16476.7, num_updates=8600, lr=0.000681994, gnorm=0.29, clip=0, loss_scale=8, train_wall=143, gb_free=59.1, wall=12723 epoch 006: 90 / 1707 loss=4.425, nll_loss=2.823, ppl=7.08, wps=295615, ups=0.69, wpb=426800, bsz=16476.7, num_updates=8600, lr=0.000681994, gnorm=0.29, clip=0, loss_scale=8, train_wall=143, gb_free=59.1, wall=12723 epoch 006: 90 / 1707 loss=4.425, nll_loss=2.823, ppl=7.08, wps=295615, ups=0.69, wpb=426800, bsz=16476.7, num_updates=8600, lr=0.000681994, gnorm=0.29, clip=0, loss_scale=8, train_wall=143, gb_free=59.1, wall=12723 epoch 006: 191 / 1707 loss=4.431, nll_loss=2.83, ppl=7.11, wps=292951, ups=0.68, wpb=429926, bsz=16301.2, num_updates=8700, lr=0.000678064, gnorm=0.299, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=12870 epoch 006: 191 / 1707 loss=4.431, nll_loss=2.83, ppl=7.11, wps=292951, ups=0.68, wpb=429926, bsz=16301.2, num_updates=8700, lr=0.000678064, gnorm=0.299, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=12870 epoch 006: 191 / 1707 loss=4.431, nll_loss=2.83, ppl=7.11, wps=292951, ups=0.68, wpb=429926, bsz=16301.2, num_updates=8700, lr=0.000678064, gnorm=0.299, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=12870 epoch 006: 191 / 1707 loss=4.431, nll_loss=2.83, ppl=7.11, wps=292951, ups=0.68, wpb=429926, bsz=16301.2, num_updates=8700, lr=0.000678064, gnorm=0.299, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=12870 epoch 006: 191 / 1707 loss=4.431, nll_loss=2.83, ppl=7.11, wps=292951, ups=0.68, wpb=429926, bsz=16301.2, num_updates=8700, lr=0.000678064, gnorm=0.299, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=12870 epoch 006: 191 / 1707 loss=4.431, nll_loss=2.83, ppl=7.11, wps=292951, ups=0.68, wpb=429926, bsz=16301.2, num_updates=8700, lr=0.000678064, gnorm=0.299, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=12870 epoch 006: 291 / 1707 loss=4.422, nll_loss=2.82, ppl=7.06, wps=296756, ups=0.69, wpb=430001, bsz=16308.5, num_updates=8800, lr=0.0006742, gnorm=0.284, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=13014 epoch 006: 291 / 1707 loss=4.422, nll_loss=2.82, ppl=7.06, wps=296756, ups=0.69, wpb=430001, bsz=16308.5, num_updates=8800, lr=0.0006742, gnorm=0.284, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=13014 epoch 006: 291 / 1707 loss=4.422, nll_loss=2.82, ppl=7.06, wps=296756, ups=0.69, wpb=430001, bsz=16308.5, num_updates=8800, lr=0.0006742, gnorm=0.284, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=13014 epoch 006: 291 / 1707 loss=4.422, nll_loss=2.82, ppl=7.06, wps=296756, ups=0.69, wpb=430001, bsz=16308.5, num_updates=8800, lr=0.0006742, gnorm=0.284, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=13014 epoch 006: 291 / 1707 loss=4.422, nll_loss=2.82, ppl=7.06, wps=296756, ups=0.69, wpb=430001, bsz=16308.5, num_updates=8800, lr=0.0006742, gnorm=0.284, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=13014 epoch 006: 291 / 1707 loss=4.422, nll_loss=2.82, ppl=7.06, wps=296756, ups=0.69, wpb=430001, bsz=16308.5, num_updates=8800, lr=0.0006742, gnorm=0.284, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=13014 epoch 006: 391 / 1707 loss=4.425, nll_loss=2.824, ppl=7.08, wps=296145, ups=0.69, wpb=428801, bsz=16149.1, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=13159 epoch 006: 391 / 1707 loss=4.425, nll_loss=2.824, ppl=7.08, wps=296145, ups=0.69, wpb=428801, bsz=16149.1, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=13159 epoch 006: 391 / 1707 loss=4.425, nll_loss=2.824, ppl=7.08, wps=296145, ups=0.69, wpb=428801, bsz=16149.1, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=13159 epoch 006: 391 / 1707 loss=4.425, nll_loss=2.824, ppl=7.08, wps=296145, ups=0.69, wpb=428801, bsz=16149.1, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=13159 epoch 006: 391 / 1707 loss=4.425, nll_loss=2.824, ppl=7.08, wps=296145, ups=0.69, wpb=428801, bsz=16149.1, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=13159 epoch 006: 391 / 1707 loss=4.425, nll_loss=2.824, ppl=7.08, wps=296145, ups=0.69, wpb=428801, bsz=16149.1, num_updates=8900, lr=0.000670402, gnorm=0.294, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=13159 epoch 006: 491 / 1707 loss=4.423, nll_loss=2.822, ppl=7.07, wps=295998, ups=0.69, wpb=429362, bsz=16224.6, num_updates=9000, lr=0.000666667, gnorm=0.29, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=13304 epoch 006: 491 / 1707 loss=4.423, nll_loss=2.822, ppl=7.07, wps=295998, ups=0.69, wpb=429362, bsz=16224.6, num_updates=9000, lr=0.000666667, gnorm=0.29, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=13304 epoch 006: 491 / 1707 loss=4.423, nll_loss=2.822, ppl=7.07, wps=295998, ups=0.69, wpb=429362, bsz=16224.6, num_updates=9000, lr=0.000666667, gnorm=0.29, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=13304 epoch 006: 491 / 1707 loss=4.423, nll_loss=2.822, ppl=7.07, wps=295998, ups=0.69, wpb=429362, bsz=16224.6, num_updates=9000, lr=0.000666667, gnorm=0.29, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=13304 epoch 006: 491 / 1707 loss=4.423, nll_loss=2.822, ppl=7.07, wps=295998, ups=0.69, wpb=429362, bsz=16224.6, num_updates=9000, lr=0.000666667, gnorm=0.29, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=13304 epoch 006: 491 / 1707 loss=4.423, nll_loss=2.822, ppl=7.07, wps=295998, ups=0.69, wpb=429362, bsz=16224.6, num_updates=9000, lr=0.000666667, gnorm=0.29, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=13304 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 4.471 | nll_loss 2.865 | ppl 7.28 | wps 76107.5 | wpb 21331 | bsz 1016 | num_updates 9000 | best_loss 4.471 epoch 006 | valid on 'valid' subset | loss 4.471 | nll_loss 2.865 | ppl 7.28 | wps 76107.5 | wpb 21331 | bsz 1016 | num_updates 9000 | best_loss 4.471 epoch 006 | valid on 'valid' subset | loss 4.471 | nll_loss 2.865 | ppl 7.28 | wps 76107.5 | wpb 21331 | bsz 1016 | num_updates 9000 | best_loss 4.471 epoch 006 | valid on 'valid' subset | loss 4.471 | nll_loss 2.865 | ppl 7.28 | wps 76107.5 | wpb 21331 | bsz 1016 | num_updates 9000 | best_loss 4.471 epoch 006 | valid on 'valid' subset | loss 4.471 | nll_loss 2.865 | ppl 7.28 | wps 76107.5 | wpb 21331 | bsz 1016 | num_updates 9000 | best_loss 4.471 epoch 006 | valid on 'valid' subset | loss 4.471 | nll_loss 2.865 | ppl 7.28 | wps 76107.5 | wpb 21331 | bsz 1016 | num_updates 9000 | best_loss 4.471 epoch 006: 592 / 1707 loss=4.417, nll_loss=2.816, ppl=7.04, wps=255858, ups=0.6, wpb=428268, bsz=16313.5, num_updates=9100, lr=0.000662994, gnorm=0.307, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=13472 epoch 006: 592 / 1707 loss=4.417, nll_loss=2.816, ppl=7.04, wps=255858, ups=0.6, wpb=428268, bsz=16313.5, num_updates=9100, lr=0.000662994, gnorm=0.307, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=13472 epoch 006: 592 / 1707 loss=4.417, nll_loss=2.816, ppl=7.04, wps=255858, ups=0.6, wpb=428268, bsz=16313.5, num_updates=9100, lr=0.000662994, gnorm=0.307, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=13472 epoch 006: 592 / 1707 loss=4.417, nll_loss=2.816, ppl=7.04, wps=255858, ups=0.6, wpb=428268, bsz=16313.5, num_updates=9100, lr=0.000662994, gnorm=0.307, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=13472 epoch 006: 592 / 1707 loss=4.417, nll_loss=2.816, ppl=7.04, wps=255858, ups=0.6, wpb=428268, bsz=16313.5, num_updates=9100, lr=0.000662994, gnorm=0.307, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=13472 epoch 006: 592 / 1707 loss=4.417, nll_loss=2.816, ppl=7.04, wps=255858, ups=0.6, wpb=428268, bsz=16313.5, num_updates=9100, lr=0.000662994, gnorm=0.307, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=13472 epoch 006: 692 / 1707 loss=4.421, nll_loss=2.821, ppl=7.07, wps=296380, ups=0.69, wpb=429396, bsz=16289.7, num_updates=9200, lr=0.00065938, gnorm=0.283, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=13617 epoch 006: 692 / 1707 loss=4.421, nll_loss=2.821, ppl=7.07, wps=296380, ups=0.69, wpb=429396, bsz=16289.7, num_updates=9200, lr=0.00065938, gnorm=0.283, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=13617 epoch 006: 692 / 1707 loss=4.421, nll_loss=2.821, ppl=7.07, wps=296380, ups=0.69, wpb=429396, bsz=16289.7, num_updates=9200, lr=0.00065938, gnorm=0.283, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=13617 epoch 006: 692 / 1707 loss=4.421, nll_loss=2.821, ppl=7.07, wps=296380, ups=0.69, wpb=429396, bsz=16289.7, num_updates=9200, lr=0.00065938, gnorm=0.283, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=13617 epoch 006: 692 / 1707 loss=4.421, nll_loss=2.821, ppl=7.07, wps=296380, ups=0.69, wpb=429396, bsz=16289.7, num_updates=9200, lr=0.00065938, gnorm=0.283, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=13617 epoch 006: 692 / 1707 loss=4.421, nll_loss=2.821, ppl=7.07, wps=296380, ups=0.69, wpb=429396, bsz=16289.7, num_updates=9200, lr=0.00065938, gnorm=0.283, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=13617 epoch 006: 792 / 1707 loss=4.414, nll_loss=2.813, ppl=7.03, wps=296553, ups=0.69, wpb=428368, bsz=16516.2, num_updates=9300, lr=0.000655826, gnorm=0.286, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=13761 epoch 006: 792 / 1707 loss=4.414, nll_loss=2.813, ppl=7.03, wps=296553, ups=0.69, wpb=428368, bsz=16516.2, num_updates=9300, lr=0.000655826, gnorm=0.286, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=13761 epoch 006: 792 / 1707 loss=4.414, nll_loss=2.813, ppl=7.03, wps=296553, ups=0.69, wpb=428368, bsz=16516.2, num_updates=9300, lr=0.000655826, gnorm=0.286, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=13761 epoch 006: 792 / 1707 loss=4.414, nll_loss=2.813, ppl=7.03, wps=296553, ups=0.69, wpb=428368, bsz=16516.2, num_updates=9300, lr=0.000655826, gnorm=0.286, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=13761 epoch 006: 792 / 1707 loss=4.414, nll_loss=2.813, ppl=7.03, wps=296553, ups=0.69, wpb=428368, bsz=16516.2, num_updates=9300, lr=0.000655826, gnorm=0.286, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=13761 epoch 006: 792 / 1707 loss=4.414, nll_loss=2.813, ppl=7.03, wps=296553, ups=0.69, wpb=428368, bsz=16516.2, num_updates=9300, lr=0.000655826, gnorm=0.286, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=13761 epoch 006: 893 / 1707 loss=4.413, nll_loss=2.813, ppl=7.03, wps=294606, ups=0.69, wpb=429838, bsz=16216.5, num_updates=9400, lr=0.000652328, gnorm=0.277, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=13907 epoch 006: 893 / 1707 loss=4.413, nll_loss=2.813, ppl=7.03, wps=294606, ups=0.69, wpb=429838, bsz=16216.5, num_updates=9400, lr=0.000652328, gnorm=0.277, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=13907 epoch 006: 893 / 1707 loss=4.413, nll_loss=2.813, ppl=7.03, wps=294606, ups=0.69, wpb=429838, bsz=16216.5, num_updates=9400, lr=0.000652328, gnorm=0.277, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=13907 epoch 006: 893 / 1707 loss=4.413, nll_loss=2.813, ppl=7.03, wps=294606, ups=0.69, wpb=429838, bsz=16216.5, num_updates=9400, lr=0.000652328, gnorm=0.277, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=13907 epoch 006: 893 / 1707 loss=4.413, nll_loss=2.813, ppl=7.03, wps=294606, ups=0.69, wpb=429838, bsz=16216.5, num_updates=9400, lr=0.000652328, gnorm=0.277, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=13907 epoch 006: 893 / 1707 loss=4.413, nll_loss=2.813, ppl=7.03, wps=294606, ups=0.69, wpb=429838, bsz=16216.5, num_updates=9400, lr=0.000652328, gnorm=0.277, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=13907 epoch 006: 993 / 1707 loss=4.409, nll_loss=2.808, ppl=7, wps=296405, ups=0.69, wpb=428700, bsz=16169.3, num_updates=9500, lr=0.000648886, gnorm=0.29, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=14052 epoch 006: 993 / 1707 loss=4.409, nll_loss=2.808, ppl=7, wps=296405, ups=0.69, wpb=428700, bsz=16169.3, num_updates=9500, lr=0.000648886, gnorm=0.29, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=14052 epoch 006: 993 / 1707 loss=4.409, nll_loss=2.808, ppl=7, wps=296405, ups=0.69, wpb=428700, bsz=16169.3, num_updates=9500, lr=0.000648886, gnorm=0.29, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=14052 epoch 006: 993 / 1707 loss=4.409, nll_loss=2.808, ppl=7, wps=296405, ups=0.69, wpb=428700, bsz=16169.3, num_updates=9500, lr=0.000648886, gnorm=0.29, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=14052 epoch 006: 993 / 1707 loss=4.409, nll_loss=2.808, ppl=7, wps=296405, ups=0.69, wpb=428700, bsz=16169.3, num_updates=9500, lr=0.000648886, gnorm=0.29, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=14052 epoch 006: 993 / 1707 loss=4.409, nll_loss=2.808, ppl=7, wps=296405, ups=0.69, wpb=428700, bsz=16169.3, num_updates=9500, lr=0.000648886, gnorm=0.29, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=14052 epoch 006: 1094 / 1707 loss=4.404, nll_loss=2.803, ppl=6.98, wps=293221, ups=0.68, wpb=429186, bsz=16354.3, num_updates=9600, lr=0.000645497, gnorm=0.279, clip=0, loss_scale=4, train_wall=146, gb_free=59, wall=14198 epoch 006: 1094 / 1707 loss=4.404, nll_loss=2.803, ppl=6.98, wps=293221, ups=0.68, wpb=429186, bsz=16354.3, num_updates=9600, lr=0.000645497, gnorm=0.279, clip=0, loss_scale=4, train_wall=146, gb_free=59, wall=14198 epoch 006: 1094 / 1707 loss=4.404, nll_loss=2.803, ppl=6.98, wps=293221, ups=0.68, wpb=429186, bsz=16354.3, num_updates=9600, lr=0.000645497, gnorm=0.279, clip=0, loss_scale=4, train_wall=146, gb_free=59, wall=14198 epoch 006: 1094 / 1707 loss=4.404, nll_loss=2.803, ppl=6.98, wps=293221, ups=0.68, wpb=429186, bsz=16354.3, num_updates=9600, lr=0.000645497, gnorm=0.279, clip=0, loss_scale=4, train_wall=146, gb_free=59, wall=14198 epoch 006: 1094 / 1707 loss=4.404, nll_loss=2.803, ppl=6.98, wps=293221, ups=0.68, wpb=429186, bsz=16354.3, num_updates=9600, lr=0.000645497, gnorm=0.279, clip=0, loss_scale=4, train_wall=146, gb_free=59, wall=14198 epoch 006: 1094 / 1707 loss=4.404, nll_loss=2.803, ppl=6.98, wps=293221, ups=0.68, wpb=429186, bsz=16354.3, num_updates=9600, lr=0.000645497, gnorm=0.279, clip=0, loss_scale=4, train_wall=146, gb_free=59, wall=14198 epoch 006: 1194 / 1707 loss=4.406, nll_loss=2.806, ppl=6.99, wps=295083, ups=0.69, wpb=428070, bsz=16547.1, num_updates=9700, lr=0.000642161, gnorm=0.278, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14343 epoch 006: 1194 / 1707 loss=4.406, nll_loss=2.806, ppl=6.99, wps=295083, ups=0.69, wpb=428070, bsz=16547.1, num_updates=9700, lr=0.000642161, gnorm=0.278, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14343 epoch 006: 1194 / 1707 loss=4.406, nll_loss=2.806, ppl=6.99, wps=295083, ups=0.69, wpb=428070, bsz=16547.1, num_updates=9700, lr=0.000642161, gnorm=0.278, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14343 epoch 006: 1194 / 1707 loss=4.406, nll_loss=2.806, ppl=6.99, wps=295083, ups=0.69, wpb=428070, bsz=16547.1, num_updates=9700, lr=0.000642161, gnorm=0.278, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14343 epoch 006: 1194 / 1707 loss=4.406, nll_loss=2.806, ppl=6.99, wps=295083, ups=0.69, wpb=428070, bsz=16547.1, num_updates=9700, lr=0.000642161, gnorm=0.278, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14343 epoch 006: 1194 / 1707 loss=4.406, nll_loss=2.806, ppl=6.99, wps=295083, ups=0.69, wpb=428070, bsz=16547.1, num_updates=9700, lr=0.000642161, gnorm=0.278, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14343 epoch 006: 1294 / 1707 loss=4.402, nll_loss=2.801, ppl=6.97, wps=297675, ups=0.69, wpb=430200, bsz=16310.2, num_updates=9800, lr=0.000638877, gnorm=0.274, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14487 epoch 006: 1294 / 1707 loss=4.402, nll_loss=2.801, ppl=6.97, wps=297675, ups=0.69, wpb=430200, bsz=16310.2, num_updates=9800, lr=0.000638877, gnorm=0.274, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14487 epoch 006: 1294 / 1707 loss=4.402, nll_loss=2.801, ppl=6.97, wps=297675, ups=0.69, wpb=430200, bsz=16310.2, num_updates=9800, lr=0.000638877, gnorm=0.274, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14487 epoch 006: 1294 / 1707 loss=4.402, nll_loss=2.801, ppl=6.97, wps=297675, ups=0.69, wpb=430200, bsz=16310.2, num_updates=9800, lr=0.000638877, gnorm=0.274, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14487 epoch 006: 1294 / 1707 loss=4.402, nll_loss=2.801, ppl=6.97, wps=297675, ups=0.69, wpb=430200, bsz=16310.2, num_updates=9800, lr=0.000638877, gnorm=0.274, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14487 epoch 006: 1294 / 1707 loss=4.402, nll_loss=2.801, ppl=6.97, wps=297675, ups=0.69, wpb=430200, bsz=16310.2, num_updates=9800, lr=0.000638877, gnorm=0.274, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=14487 epoch 006: 1394 / 1707 loss=4.408, nll_loss=2.809, ppl=7.01, wps=296737, ups=0.69, wpb=429321, bsz=16281.4, num_updates=9900, lr=0.000635642, gnorm=0.281, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=14632 epoch 006: 1394 / 1707 loss=4.408, nll_loss=2.809, ppl=7.01, wps=296737, ups=0.69, wpb=429321, bsz=16281.4, num_updates=9900, lr=0.000635642, gnorm=0.281, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=14632 epoch 006: 1394 / 1707 loss=4.408, nll_loss=2.809, ppl=7.01, wps=296737, ups=0.69, wpb=429321, bsz=16281.4, num_updates=9900, lr=0.000635642, gnorm=0.281, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=14632 epoch 006: 1394 / 1707 loss=4.408, nll_loss=2.809, ppl=7.01, wps=296737, ups=0.69, wpb=429321, bsz=16281.4, num_updates=9900, lr=0.000635642, gnorm=0.281, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=14632 epoch 006: 1394 / 1707 loss=4.408, nll_loss=2.809, ppl=7.01, wps=296737, ups=0.69, wpb=429321, bsz=16281.4, num_updates=9900, lr=0.000635642, gnorm=0.281, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=14632 epoch 006: 1394 / 1707 loss=4.408, nll_loss=2.809, ppl=7.01, wps=296737, ups=0.69, wpb=429321, bsz=16281.4, num_updates=9900, lr=0.000635642, gnorm=0.281, clip=0, loss_scale=8, train_wall=144, gb_free=58.9, wall=14632 epoch 006: 1495 / 1707 loss=4.395, nll_loss=2.794, ppl=6.94, wps=293483, ups=0.68, wpb=428452, bsz=16260.8, num_updates=10000, lr=0.000632456, gnorm=0.28, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=14778 epoch 006: 1495 / 1707 loss=4.395, nll_loss=2.794, ppl=6.94, wps=293483, ups=0.68, wpb=428452, bsz=16260.8, num_updates=10000, lr=0.000632456, gnorm=0.28, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=14778 epoch 006: 1495 / 1707 loss=4.395, nll_loss=2.794, ppl=6.94, wps=293483, ups=0.68, wpb=428452, bsz=16260.8, num_updates=10000, lr=0.000632456, gnorm=0.28, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=14778 epoch 006: 1495 / 1707 loss=4.395, nll_loss=2.794, ppl=6.94, wps=293483, ups=0.68, wpb=428452, bsz=16260.8, num_updates=10000, lr=0.000632456, gnorm=0.28, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=14778 epoch 006: 1495 / 1707 loss=4.395, nll_loss=2.794, ppl=6.94, wps=293483, ups=0.68, wpb=428452, bsz=16260.8, num_updates=10000, lr=0.000632456, gnorm=0.28, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=14778 epoch 006: 1495 / 1707 loss=4.395, nll_loss=2.794, ppl=6.94, wps=293483, ups=0.68, wpb=428452, bsz=16260.8, num_updates=10000, lr=0.000632456, gnorm=0.28, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=14778 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 4.459 | nll_loss 2.842 | ppl 7.17 | wps 76542.2 | wpb 21331 | bsz 1016 | num_updates 10000 | best_loss 4.459 epoch 006 | valid on 'valid' subset | loss 4.459 | nll_loss 2.842 | ppl 7.17 | wps 76542.2 | wpb 21331 | bsz 1016 | num_updates 10000 | best_loss 4.459 epoch 006 | valid on 'valid' subset | loss 4.459 | nll_loss 2.842 | ppl 7.17 | wps 76542.2 | wpb 21331 | bsz 1016 | num_updates 10000 | best_loss 4.459 epoch 006 | valid on 'valid' subset | loss 4.459 | nll_loss 2.842 | ppl 7.17 | wps 76542.2 | wpb 21331 | bsz 1016 | num_updates 10000 | best_loss 4.459 epoch 006 | valid on 'valid' subset | loss 4.459 | nll_loss 2.842 | ppl 7.17 | wps 76542.2 | wpb 21331 | bsz 1016 | num_updates 10000 | best_loss 4.459 epoch 006 | valid on 'valid' subset | loss 4.459 | nll_loss 2.842 | ppl 7.17 | wps 76542.2 | wpb 21331 | bsz 1016 | num_updates 10000 | best_loss 4.459 epoch 006: 1595 / 1707 loss=4.396, nll_loss=2.796, ppl=6.95, wps=260352, ups=0.61, wpb=428964, bsz=16599.1, num_updates=10100, lr=0.000629317, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=14943 epoch 006: 1595 / 1707 loss=4.396, nll_loss=2.796, ppl=6.95, wps=260352, ups=0.61, wpb=428964, bsz=16599.1, num_updates=10100, lr=0.000629317, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=14943 epoch 006: 1595 / 1707 loss=4.396, nll_loss=2.796, ppl=6.95, wps=260352, ups=0.61, wpb=428964, bsz=16599.1, num_updates=10100, lr=0.000629317, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=14943 epoch 006: 1595 / 1707 loss=4.396, nll_loss=2.796, ppl=6.95, wps=260352, ups=0.61, wpb=428964, bsz=16599.1, num_updates=10100, lr=0.000629317, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=14943 epoch 006: 1595 / 1707 loss=4.396, nll_loss=2.796, ppl=6.95, wps=260352, ups=0.61, wpb=428964, bsz=16599.1, num_updates=10100, lr=0.000629317, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=14943 epoch 006: 1595 / 1707 loss=4.396, nll_loss=2.796, ppl=6.95, wps=260352, ups=0.61, wpb=428964, bsz=16599.1, num_updates=10100, lr=0.000629317, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=14943 epoch 006: 1695 / 1707 loss=4.389, nll_loss=2.788, ppl=6.91, wps=297111, ups=0.69, wpb=428483, bsz=16330.4, num_updates=10200, lr=0.000626224, gnorm=0.263, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=15087 epoch 006: 1695 / 1707 loss=4.389, nll_loss=2.788, ppl=6.91, wps=297111, ups=0.69, wpb=428483, bsz=16330.4, num_updates=10200, lr=0.000626224, gnorm=0.263, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=15087 epoch 006: 1695 / 1707 loss=4.389, nll_loss=2.788, ppl=6.91, wps=297111, ups=0.69, wpb=428483, bsz=16330.4, num_updates=10200, lr=0.000626224, gnorm=0.263, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=15087 epoch 006: 1695 / 1707 loss=4.389, nll_loss=2.788, ppl=6.91, wps=297111, ups=0.69, wpb=428483, bsz=16330.4, num_updates=10200, lr=0.000626224, gnorm=0.263, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=15087 epoch 006: 1695 / 1707 loss=4.389, nll_loss=2.788, ppl=6.91, wps=297111, ups=0.69, wpb=428483, bsz=16330.4, num_updates=10200, lr=0.000626224, gnorm=0.263, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=15087 epoch 006: 1695 / 1707 loss=4.389, nll_loss=2.788, ppl=6.91, wps=297111, ups=0.69, wpb=428483, bsz=16330.4, num_updates=10200, lr=0.000626224, gnorm=0.263, clip=0, loss_scale=8, train_wall=144, gb_free=59.3, wall=15087 end of epoch 6 (average epoch stats below) epoch 006 | loss 4.411 | nll_loss 2.81 | ppl 7.01 | wps 290701 | ups 0.68 | wpb 428943 | bsz 16328.8 | num_updates 10212 | lr 0.000625856 | gnorm 0.285 | clip 0 | loss_scale 8 | train_wall 2456 | gb_free 60.4 | wall 15103 epoch 006 | loss 4.411 | nll_loss 2.81 | ppl 7.01 | wps 290701 | ups 0.68 | wpb 428943 | bsz 16328.8 | num_updates 10212 | lr 0.000625856 | gnorm 0.285 | clip 0 | loss_scale 8 | train_wall 2456 | gb_free 60.4 | wall 15103 epoch 006 | loss 4.411 | nll_loss 2.81 | ppl 7.01 | wps 290701 | ups 0.68 | wpb 428943 | bsz 16328.8 | num_updates 10212 | lr 0.000625856 | gnorm 0.285 | clip 0 | loss_scale 8 | train_wall 2456 | gb_free 60.4 | wall 15103 epoch 006 | loss 4.411 | nll_loss 2.81 | ppl 7.01 | wps 290701 | ups 0.68 | wpb 428943 | bsz 16328.8 | num_updates 10212 | lr 0.000625856 | gnorm 0.285 | clip 0 | loss_scale 8 | train_wall 2456 | gb_free 60.4 | wall 15103 epoch 006 | loss 4.411 | nll_loss 2.81 | ppl 7.01 | wps 290701 | ups 0.68 | wpb 428943 | bsz 16328.8 | num_updates 10212 | lr 0.000625856 | gnorm 0.285 | clip 0 | loss_scale 8 | train_wall 2456 | gb_free 60.4 | wall 15103 epoch 006 | loss 4.411 | nll_loss 2.81 | ppl 7.01 | wps 290701 | ups 0.68 | wpb 428943 | bsz 16328.8 | num_updates 10212 | lr 0.000625856 | gnorm 0.285 | clip 0 | loss_scale 8 | train_wall 2456 | gb_free 60.4 | wall 15103 Start iterating over samples epoch 007: 89 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=293097, ups=0.69, wpb=426712, bsz=15938.4, num_updates=10300, lr=0.000623177, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15233 epoch 007: 89 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=293097, ups=0.69, wpb=426712, bsz=15938.4, num_updates=10300, lr=0.000623177, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15233 epoch 007: 89 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=293097, ups=0.69, wpb=426712, bsz=15938.4, num_updates=10300, lr=0.000623177, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15233 epoch 007: 89 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=293097, ups=0.69, wpb=426712, bsz=15938.4, num_updates=10300, lr=0.000623177, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15233 epoch 007: 89 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=293097, ups=0.69, wpb=426712, bsz=15938.4, num_updates=10300, lr=0.000623177, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15233 epoch 007: 89 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=293097, ups=0.69, wpb=426712, bsz=15938.4, num_updates=10300, lr=0.000623177, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15233 epoch 007: 89 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=293097, ups=0.69, wpb=426712, bsz=15938.4, num_updates=10300, lr=0.000623177, gnorm=0.28, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15233 epoch 007: 189 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=296532, ups=0.69, wpb=429678, bsz=16435.4, num_updates=10400, lr=0.000620174, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15378 epoch 007: 189 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=296532, ups=0.69, wpb=429678, bsz=16435.4, num_updates=10400, lr=0.000620174, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15378 epoch 007: 189 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=296532, ups=0.69, wpb=429678, bsz=16435.4, num_updates=10400, lr=0.000620174, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15378 epoch 007: 189 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=296532, ups=0.69, wpb=429678, bsz=16435.4, num_updates=10400, lr=0.000620174, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15378 epoch 007: 189 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=296532, ups=0.69, wpb=429678, bsz=16435.4, num_updates=10400, lr=0.000620174, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15378 epoch 007: 189 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=296532, ups=0.69, wpb=429678, bsz=16435.4, num_updates=10400, lr=0.000620174, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15378 epoch 007: 189 / 1707 loss=4.347, nll_loss=2.739, ppl=6.68, wps=296532, ups=0.69, wpb=429678, bsz=16435.4, num_updates=10400, lr=0.000620174, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=15378 epoch 007: 289 / 1707 loss=4.349, nll_loss=2.742, ppl=6.69, wps=296401, ups=0.69, wpb=430611, bsz=16399.2, num_updates=10500, lr=0.000617213, gnorm=0.276, clip=0, loss_scale=8, train_wall=145, gb_free=59.1, wall=15523 epoch 007: 289 / 1707 loss=4.349, nll_loss=2.742, ppl=6.69, wps=296401, ups=0.69, wpb=430611, bsz=16399.2, num_updates=10500, lr=0.000617213, gnorm=0.276, clip=0, loss_scale=8, train_wall=145, gb_free=59.1, wall=15523 epoch 007: 289 / 1707 loss=4.349, nll_loss=2.742, ppl=6.69, wps=296401, ups=0.69, wpb=430611, bsz=16399.2, num_updates=10500, lr=0.000617213, gnorm=0.276, clip=0, loss_scale=8, train_wall=145, gb_free=59.1, wall=15523 epoch 007: 289 / 1707 loss=4.349, nll_loss=2.742, ppl=6.69, wps=296401, ups=0.69, wpb=430611, bsz=16399.2, num_updates=10500, lr=0.000617213, gnorm=0.276, clip=0, loss_scale=8, train_wall=145, gb_free=59.1, wall=15523 epoch 007: 289 / 1707 loss=4.349, nll_loss=2.742, ppl=6.69, wps=296401, ups=0.69, wpb=430611, bsz=16399.2, num_updates=10500, lr=0.000617213, gnorm=0.276, clip=0, loss_scale=8, train_wall=145, gb_free=59.1, wall=15523 epoch 007: 289 / 1707 loss=4.349, nll_loss=2.742, ppl=6.69, wps=296401, ups=0.69, wpb=430611, bsz=16399.2, num_updates=10500, lr=0.000617213, gnorm=0.276, clip=0, loss_scale=8, train_wall=145, gb_free=59.1, wall=15523 epoch 007: 289 / 1707 loss=4.349, nll_loss=2.742, ppl=6.69, wps=296401, ups=0.69, wpb=430611, bsz=16399.2, num_updates=10500, lr=0.000617213, gnorm=0.276, clip=0, loss_scale=8, train_wall=145, gb_free=59.1, wall=15523 epoch 007: 390 / 1707 loss=4.351, nll_loss=2.744, ppl=6.7, wps=293567, ups=0.69, wpb=428539, bsz=16317.8, num_updates=10600, lr=0.000614295, gnorm=0.286, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=15669 epoch 007: 390 / 1707 loss=4.351, nll_loss=2.744, ppl=6.7, wps=293567, ups=0.69, wpb=428539, bsz=16317.8, num_updates=10600, lr=0.000614295, gnorm=0.286, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=15669 epoch 007: 390 / 1707 loss=4.351, nll_loss=2.744, ppl=6.7, wps=293567, ups=0.69, wpb=428539, bsz=16317.8, num_updates=10600, lr=0.000614295, gnorm=0.286, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=15669 epoch 007: 390 / 1707 loss=4.351, nll_loss=2.744, ppl=6.7, wps=293567, ups=0.69, wpb=428539, bsz=16317.8, num_updates=10600, lr=0.000614295, gnorm=0.286, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=15669 epoch 007: 390 / 1707 loss=4.351, nll_loss=2.744, ppl=6.7, wps=293567, ups=0.69, wpb=428539, bsz=16317.8, num_updates=10600, lr=0.000614295, gnorm=0.286, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=15669 epoch 007: 390 / 1707 loss=4.351, nll_loss=2.744, ppl=6.7, wps=293567, ups=0.69, wpb=428539, bsz=16317.8, num_updates=10600, lr=0.000614295, gnorm=0.286, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=15669 epoch 007: 390 / 1707 loss=4.351, nll_loss=2.744, ppl=6.7, wps=293567, ups=0.69, wpb=428539, bsz=16317.8, num_updates=10600, lr=0.000614295, gnorm=0.286, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=15669 epoch 007: 490 / 1707 loss=4.357, nll_loss=2.751, ppl=6.73, wps=296747, ups=0.69, wpb=429274, bsz=16215.4, num_updates=10700, lr=0.000611418, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=15814 epoch 007: 490 / 1707 loss=4.357, nll_loss=2.751, ppl=6.73, wps=296747, ups=0.69, wpb=429274, bsz=16215.4, num_updates=10700, lr=0.000611418, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=15814 epoch 007: 490 / 1707 loss=4.357, nll_loss=2.751, ppl=6.73, wps=296747, ups=0.69, wpb=429274, bsz=16215.4, num_updates=10700, lr=0.000611418, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=15814 epoch 007: 490 / 1707 loss=4.357, nll_loss=2.751, ppl=6.73, wps=296747, ups=0.69, wpb=429274, bsz=16215.4, num_updates=10700, lr=0.000611418, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=15814 epoch 007: 490 / 1707 loss=4.357, nll_loss=2.751, ppl=6.73, wps=296747, ups=0.69, wpb=429274, bsz=16215.4, num_updates=10700, lr=0.000611418, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=15814 epoch 007: 490 / 1707 loss=4.357, nll_loss=2.751, ppl=6.73, wps=296747, ups=0.69, wpb=429274, bsz=16215.4, num_updates=10700, lr=0.000611418, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=15814 epoch 007: 490 / 1707 loss=4.357, nll_loss=2.751, ppl=6.73, wps=296747, ups=0.69, wpb=429274, bsz=16215.4, num_updates=10700, lr=0.000611418, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=15814 epoch 007: 591 / 1707 loss=4.352, nll_loss=2.747, ppl=6.71, wps=293912, ups=0.68, wpb=430339, bsz=16550.8, num_updates=10800, lr=0.000608581, gnorm=0.276, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=15960 epoch 007: 591 / 1707 loss=4.352, nll_loss=2.747, ppl=6.71, wps=293912, ups=0.68, wpb=430339, bsz=16550.8, num_updates=10800, lr=0.000608581, gnorm=0.276, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=15960 epoch 007: 591 / 1707 loss=4.352, nll_loss=2.747, ppl=6.71, wps=293912, ups=0.68, wpb=430339, bsz=16550.8, num_updates=10800, lr=0.000608581, gnorm=0.276, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=15960 epoch 007: 591 / 1707 loss=4.352, nll_loss=2.747, ppl=6.71, wps=293912, ups=0.68, wpb=430339, bsz=16550.8, num_updates=10800, lr=0.000608581, gnorm=0.276, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=15960 epoch 007: 591 / 1707 loss=4.352, nll_loss=2.747, ppl=6.71, wps=293912, ups=0.68, wpb=430339, bsz=16550.8, num_updates=10800, lr=0.000608581, gnorm=0.276, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=15960 epoch 007: 591 / 1707 loss=4.352, nll_loss=2.747, ppl=6.71, wps=293912, ups=0.68, wpb=430339, bsz=16550.8, num_updates=10800, lr=0.000608581, gnorm=0.276, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=15960 epoch 007: 591 / 1707 loss=4.352, nll_loss=2.747, ppl=6.71, wps=293912, ups=0.68, wpb=430339, bsz=16550.8, num_updates=10800, lr=0.000608581, gnorm=0.276, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=15960 epoch 007: 691 / 1707 loss=4.352, nll_loss=2.746, ppl=6.71, wps=295846, ups=0.69, wpb=429089, bsz=16419, num_updates=10900, lr=0.000605783, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=16105 epoch 007: 691 / 1707 loss=4.352, nll_loss=2.746, ppl=6.71, wps=295846, ups=0.69, wpb=429089, bsz=16419, num_updates=10900, lr=0.000605783, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=16105 epoch 007: 691 / 1707 loss=4.352, nll_loss=2.746, ppl=6.71, wps=295846, ups=0.69, wpb=429089, bsz=16419, num_updates=10900, lr=0.000605783, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=16105 epoch 007: 691 / 1707 loss=4.352, nll_loss=2.746, ppl=6.71, wps=295846, ups=0.69, wpb=429089, bsz=16419, num_updates=10900, lr=0.000605783, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=16105 epoch 007: 691 / 1707 loss=4.352, nll_loss=2.746, ppl=6.71, wps=295846, ups=0.69, wpb=429089, bsz=16419, num_updates=10900, lr=0.000605783, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=16105 epoch 007: 691 / 1707 loss=4.352, nll_loss=2.746, ppl=6.71, wps=295846, ups=0.69, wpb=429089, bsz=16419, num_updates=10900, lr=0.000605783, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=16105 epoch 007: 691 / 1707 loss=4.352, nll_loss=2.746, ppl=6.71, wps=295846, ups=0.69, wpb=429089, bsz=16419, num_updates=10900, lr=0.000605783, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=16105 epoch 007: 791 / 1707 loss=4.347, nll_loss=2.741, ppl=6.68, wps=298050, ups=0.69, wpb=430413, bsz=16331.9, num_updates=11000, lr=0.000603023, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=16249 epoch 007: 791 / 1707 loss=4.347, nll_loss=2.741, ppl=6.68, wps=298050, ups=0.69, wpb=430413, bsz=16331.9, num_updates=11000, lr=0.000603023, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=16249 epoch 007: 791 / 1707 loss=4.347, nll_loss=2.741, ppl=6.68, wps=298050, ups=0.69, wpb=430413, bsz=16331.9, num_updates=11000, lr=0.000603023, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=16249 epoch 007: 791 / 1707 loss=4.347, nll_loss=2.741, ppl=6.68, wps=298050, ups=0.69, wpb=430413, bsz=16331.9, num_updates=11000, lr=0.000603023, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=16249 epoch 007: 791 / 1707 loss=4.347, nll_loss=2.741, ppl=6.68, wps=298050, ups=0.69, wpb=430413, bsz=16331.9, num_updates=11000, lr=0.000603023, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=16249 epoch 007: 791 / 1707 loss=4.347, nll_loss=2.741, ppl=6.68, wps=298050, ups=0.69, wpb=430413, bsz=16331.9, num_updates=11000, lr=0.000603023, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=16249 epoch 007: 791 / 1707 loss=4.347, nll_loss=2.741, ppl=6.68, wps=298050, ups=0.69, wpb=430413, bsz=16331.9, num_updates=11000, lr=0.000603023, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=16249 begin validation on "valid" subset epoch 007 | valid on 'valid' subset | loss 4.423 | nll_loss 2.808 | ppl 7 | wps 76787.4 | wpb 21331 | bsz 1016 | num_updates 11000 | best_loss 4.423 epoch 007 | valid on 'valid' subset | loss 4.423 | nll_loss 2.808 | ppl 7 | wps 76787.4 | wpb 21331 | bsz 1016 | num_updates 11000 | best_loss 4.423 epoch 007 | valid on 'valid' subset | loss 4.423 | nll_loss 2.808 | ppl 7 | wps 76787.4 | wpb 21331 | bsz 1016 | num_updates 11000 | best_loss 4.423 epoch 007 | valid on 'valid' subset | loss 4.423 | nll_loss 2.808 | ppl 7 | wps 76787.4 | wpb 21331 | bsz 1016 | num_updates 11000 | best_loss 4.423 epoch 007 | valid on 'valid' subset | loss 4.423 | nll_loss 2.808 | ppl 7 | wps 76787.4 | wpb 21331 | bsz 1016 | num_updates 11000 | best_loss 4.423 epoch 007 | valid on 'valid' subset | loss 4.423 | nll_loss 2.808 | ppl 7 | wps 76787.4 | wpb 21331 | bsz 1016 | num_updates 11000 | best_loss 4.423 epoch 007 | valid on 'valid' subset | loss 4.423 | nll_loss 2.808 | ppl 7 | wps 76787.4 | wpb 21331 | bsz 1016 | num_updates 11000 | best_loss 4.423 epoch 007: 891 / 1707 loss=4.356, nll_loss=2.751, ppl=6.73, wps=254478, ups=0.59, wpb=428470, bsz=16361.8, num_updates=11100, lr=0.0006003, gnorm=0.278, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=16418 epoch 007: 891 / 1707 loss=4.356, nll_loss=2.751, ppl=6.73, wps=254478, ups=0.59, wpb=428470, bsz=16361.8, num_updates=11100, lr=0.0006003, gnorm=0.278, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=16418 epoch 007: 891 / 1707 loss=4.356, nll_loss=2.751, ppl=6.73, wps=254478, ups=0.59, wpb=428470, bsz=16361.8, num_updates=11100, lr=0.0006003, gnorm=0.278, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=16418 epoch 007: 891 / 1707 loss=4.356, nll_loss=2.751, ppl=6.73, wps=254478, ups=0.59, wpb=428470, bsz=16361.8, num_updates=11100, lr=0.0006003, gnorm=0.278, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=16418 epoch 007: 891 / 1707 loss=4.356, nll_loss=2.751, ppl=6.73, wps=254478, ups=0.59, wpb=428470, bsz=16361.8, num_updates=11100, lr=0.0006003, gnorm=0.278, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=16418 epoch 007: 891 / 1707 loss=4.356, nll_loss=2.751, ppl=6.73, wps=254478, ups=0.59, wpb=428470, bsz=16361.8, num_updates=11100, lr=0.0006003, gnorm=0.278, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=16418 epoch 007: 891 / 1707 loss=4.356, nll_loss=2.751, ppl=6.73, wps=254478, ups=0.59, wpb=428470, bsz=16361.8, num_updates=11100, lr=0.0006003, gnorm=0.278, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=16418 epoch 007: 992 / 1707 loss=4.342, nll_loss=2.735, ppl=6.66, wps=293566, ups=0.69, wpb=428149, bsz=16274.7, num_updates=11200, lr=0.000597614, gnorm=0.265, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=16564 epoch 007: 992 / 1707 loss=4.342, nll_loss=2.735, ppl=6.66, wps=293566, ups=0.69, wpb=428149, bsz=16274.7, num_updates=11200, lr=0.000597614, gnorm=0.265, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=16564 epoch 007: 992 / 1707 loss=4.342, nll_loss=2.735, ppl=6.66, wps=293566, ups=0.69, wpb=428149, bsz=16274.7, num_updates=11200, lr=0.000597614, gnorm=0.265, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=16564 epoch 007: 992 / 1707 loss=4.342, nll_loss=2.735, ppl=6.66, wps=293566, ups=0.69, wpb=428149, bsz=16274.7, num_updates=11200, lr=0.000597614, gnorm=0.265, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=16564 epoch 007: 992 / 1707 loss=4.342, nll_loss=2.735, ppl=6.66, wps=293566, ups=0.69, wpb=428149, bsz=16274.7, num_updates=11200, lr=0.000597614, gnorm=0.265, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=16564 epoch 007: 992 / 1707 loss=4.342, nll_loss=2.735, ppl=6.66, wps=293566, ups=0.69, wpb=428149, bsz=16274.7, num_updates=11200, lr=0.000597614, gnorm=0.265, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=16564 epoch 007: 992 / 1707 loss=4.342, nll_loss=2.735, ppl=6.66, wps=293566, ups=0.69, wpb=428149, bsz=16274.7, num_updates=11200, lr=0.000597614, gnorm=0.265, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=16564 epoch 007: 1092 / 1707 loss=4.336, nll_loss=2.729, ppl=6.63, wps=295766, ups=0.69, wpb=428037, bsz=16381.1, num_updates=11300, lr=0.000594964, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=16708 epoch 007: 1092 / 1707 loss=4.336, nll_loss=2.729, ppl=6.63, wps=295766, ups=0.69, wpb=428037, bsz=16381.1, num_updates=11300, lr=0.000594964, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=16708 epoch 007: 1092 / 1707 loss=4.336, nll_loss=2.729, ppl=6.63, wps=295766, ups=0.69, wpb=428037, bsz=16381.1, num_updates=11300, lr=0.000594964, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=16708 epoch 007: 1092 / 1707 loss=4.336, nll_loss=2.729, ppl=6.63, wps=295766, ups=0.69, wpb=428037, bsz=16381.1, num_updates=11300, lr=0.000594964, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=16708 epoch 007: 1092 / 1707 loss=4.336, nll_loss=2.729, ppl=6.63, wps=295766, ups=0.69, wpb=428037, bsz=16381.1, num_updates=11300, lr=0.000594964, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=16708 epoch 007: 1092 / 1707 loss=4.336, nll_loss=2.729, ppl=6.63, wps=295766, ups=0.69, wpb=428037, bsz=16381.1, num_updates=11300, lr=0.000594964, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=16708 epoch 007: 1092 / 1707 loss=4.336, nll_loss=2.729, ppl=6.63, wps=295766, ups=0.69, wpb=428037, bsz=16381.1, num_updates=11300, lr=0.000594964, gnorm=0.268, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=16708 epoch 007: 1193 / 1707 loss=4.349, nll_loss=2.745, ppl=6.7, wps=293637, ups=0.68, wpb=428696, bsz=16287, num_updates=11400, lr=0.000592349, gnorm=0.26, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=16854 epoch 007: 1193 / 1707 loss=4.349, nll_loss=2.745, ppl=6.7, wps=293637, ups=0.68, wpb=428696, bsz=16287, num_updates=11400, lr=0.000592349, gnorm=0.26, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=16854 epoch 007: 1193 / 1707 loss=4.349, nll_loss=2.745, ppl=6.7, wps=293637, ups=0.68, wpb=428696, bsz=16287, num_updates=11400, lr=0.000592349, gnorm=0.26, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=16854 epoch 007: 1193 / 1707 loss=4.349, nll_loss=2.745, ppl=6.7, wps=293637, ups=0.68, wpb=428696, bsz=16287, num_updates=11400, lr=0.000592349, gnorm=0.26, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=16854 epoch 007: 1193 / 1707 loss=4.349, nll_loss=2.745, ppl=6.7, wps=293637, ups=0.68, wpb=428696, bsz=16287, num_updates=11400, lr=0.000592349, gnorm=0.26, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=16854 epoch 007: 1193 / 1707 loss=4.349, nll_loss=2.745, ppl=6.7, wps=293637, ups=0.68, wpb=428696, bsz=16287, num_updates=11400, lr=0.000592349, gnorm=0.26, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=16854 epoch 007: 1193 / 1707 loss=4.349, nll_loss=2.745, ppl=6.7, wps=293637, ups=0.68, wpb=428696, bsz=16287, num_updates=11400, lr=0.000592349, gnorm=0.26, clip=0, loss_scale=4, train_wall=145, gb_free=59.4, wall=16854 epoch 007: 1294 / 1707 loss=4.339, nll_loss=2.732, ppl=6.65, wps=293717, ups=0.68, wpb=429109, bsz=16269.5, num_updates=11500, lr=0.000589768, gnorm=0.264, clip=0, loss_scale=2, train_wall=145, gb_free=59.6, wall=17000 epoch 007: 1294 / 1707 loss=4.339, nll_loss=2.732, ppl=6.65, wps=293717, ups=0.68, wpb=429109, bsz=16269.5, num_updates=11500, lr=0.000589768, gnorm=0.264, clip=0, loss_scale=2, train_wall=145, gb_free=59.6, wall=17000 epoch 007: 1294 / 1707 loss=4.339, nll_loss=2.732, ppl=6.65, wps=293717, ups=0.68, wpb=429109, bsz=16269.5, num_updates=11500, lr=0.000589768, gnorm=0.264, clip=0, loss_scale=2, train_wall=145, gb_free=59.6, wall=17000 epoch 007: 1294 / 1707 loss=4.339, nll_loss=2.732, ppl=6.65, wps=293717, ups=0.68, wpb=429109, bsz=16269.5, num_updates=11500, lr=0.000589768, gnorm=0.264, clip=0, loss_scale=2, train_wall=145, gb_free=59.6, wall=17000 epoch 007: 1294 / 1707 loss=4.339, nll_loss=2.732, ppl=6.65, wps=293717, ups=0.68, wpb=429109, bsz=16269.5, num_updates=11500, lr=0.000589768, gnorm=0.264, clip=0, loss_scale=2, train_wall=145, gb_free=59.6, wall=17000 epoch 007: 1294 / 1707 loss=4.339, nll_loss=2.732, ppl=6.65, wps=293717, ups=0.68, wpb=429109, bsz=16269.5, num_updates=11500, lr=0.000589768, gnorm=0.264, clip=0, loss_scale=2, train_wall=145, gb_free=59.6, wall=17000 epoch 007: 1294 / 1707 loss=4.339, nll_loss=2.732, ppl=6.65, wps=293717, ups=0.68, wpb=429109, bsz=16269.5, num_updates=11500, lr=0.000589768, gnorm=0.264, clip=0, loss_scale=2, train_wall=145, gb_free=59.6, wall=17000 epoch 007: 1394 / 1707 loss=4.349, nll_loss=2.744, ppl=6.7, wps=296152, ups=0.69, wpb=428223, bsz=16180.2, num_updates=11600, lr=0.00058722, gnorm=0.277, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=17145 epoch 007: 1394 / 1707 loss=4.349, nll_loss=2.744, ppl=6.7, wps=296152, ups=0.69, wpb=428223, bsz=16180.2, num_updates=11600, lr=0.00058722, gnorm=0.277, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=17145 epoch 007: 1394 / 1707 loss=4.349, nll_loss=2.744, ppl=6.7, wps=296152, ups=0.69, wpb=428223, bsz=16180.2, num_updates=11600, lr=0.00058722, gnorm=0.277, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=17145 epoch 007: 1394 / 1707 loss=4.349, nll_loss=2.744, ppl=6.7, wps=296152, ups=0.69, wpb=428223, bsz=16180.2, num_updates=11600, lr=0.00058722, gnorm=0.277, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=17145 epoch 007: 1394 / 1707 loss=4.349, nll_loss=2.744, ppl=6.7, wps=296152, ups=0.69, wpb=428223, bsz=16180.2, num_updates=11600, lr=0.00058722, gnorm=0.277, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=17145 epoch 007: 1394 / 1707 loss=4.349, nll_loss=2.744, ppl=6.7, wps=296152, ups=0.69, wpb=428223, bsz=16180.2, num_updates=11600, lr=0.00058722, gnorm=0.277, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=17145 epoch 007: 1394 / 1707 loss=4.349, nll_loss=2.744, ppl=6.7, wps=296152, ups=0.69, wpb=428223, bsz=16180.2, num_updates=11600, lr=0.00058722, gnorm=0.277, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=17145 epoch 007: 1494 / 1707 loss=4.338, nll_loss=2.732, ppl=6.64, wps=295405, ups=0.69, wpb=428460, bsz=16393, num_updates=11700, lr=0.000584705, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=17290 epoch 007: 1494 / 1707 loss=4.338, nll_loss=2.732, ppl=6.64, wps=295405, ups=0.69, wpb=428460, bsz=16393, num_updates=11700, lr=0.000584705, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=17290 epoch 007: 1494 / 1707 loss=4.338, nll_loss=2.732, ppl=6.64, wps=295405, ups=0.69, wpb=428460, bsz=16393, num_updates=11700, lr=0.000584705, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=17290 epoch 007: 1494 / 1707 loss=4.338, nll_loss=2.732, ppl=6.64, wps=295405, ups=0.69, wpb=428460, bsz=16393, num_updates=11700, lr=0.000584705, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=17290 epoch 007: 1494 / 1707 loss=4.338, nll_loss=2.732, ppl=6.64, wps=295405, ups=0.69, wpb=428460, bsz=16393, num_updates=11700, lr=0.000584705, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=17290 epoch 007: 1494 / 1707 loss=4.338, nll_loss=2.732, ppl=6.64, wps=295405, ups=0.69, wpb=428460, bsz=16393, num_updates=11700, lr=0.000584705, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=17290 epoch 007: 1494 / 1707 loss=4.338, nll_loss=2.732, ppl=6.64, wps=295405, ups=0.69, wpb=428460, bsz=16393, num_updates=11700, lr=0.000584705, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=17290 epoch 007: 1594 / 1707 loss=4.34, nll_loss=2.734, ppl=6.65, wps=297577, ups=0.69, wpb=430004, bsz=16392.2, num_updates=11800, lr=0.000582223, gnorm=0.267, clip=0, loss_scale=4, train_wall=144, gb_free=59.7, wall=17435 epoch 007: 1594 / 1707 loss=4.34, nll_loss=2.734, ppl=6.65, wps=297577, ups=0.69, wpb=430004, bsz=16392.2, num_updates=11800, lr=0.000582223, gnorm=0.267, clip=0, loss_scale=4, train_wall=144, gb_free=59.7, wall=17435 epoch 007: 1594 / 1707 loss=4.34, nll_loss=2.734, ppl=6.65, wps=297577, ups=0.69, wpb=430004, bsz=16392.2, num_updates=11800, lr=0.000582223, gnorm=0.267, clip=0, loss_scale=4, train_wall=144, gb_free=59.7, wall=17435 epoch 007: 1594 / 1707 loss=4.34, nll_loss=2.734, ppl=6.65, wps=297577, ups=0.69, wpb=430004, bsz=16392.2, num_updates=11800, lr=0.000582223, gnorm=0.267, clip=0, loss_scale=4, train_wall=144, gb_free=59.7, wall=17435 epoch 007: 1594 / 1707 loss=4.34, nll_loss=2.734, ppl=6.65, wps=297577, ups=0.69, wpb=430004, bsz=16392.2, num_updates=11800, lr=0.000582223, gnorm=0.267, clip=0, loss_scale=4, train_wall=144, gb_free=59.7, wall=17435 epoch 007: 1594 / 1707 loss=4.34, nll_loss=2.734, ppl=6.65, wps=297577, ups=0.69, wpb=430004, bsz=16392.2, num_updates=11800, lr=0.000582223, gnorm=0.267, clip=0, loss_scale=4, train_wall=144, gb_free=59.7, wall=17435 epoch 007: 1594 / 1707 loss=4.34, nll_loss=2.734, ppl=6.65, wps=297577, ups=0.69, wpb=430004, bsz=16392.2, num_updates=11800, lr=0.000582223, gnorm=0.267, clip=0, loss_scale=4, train_wall=144, gb_free=59.7, wall=17435 epoch 007: 1694 / 1707 loss=4.333, nll_loss=2.728, ppl=6.62, wps=295826, ups=0.69, wpb=428336, bsz=16438.7, num_updates=11900, lr=0.000579771, gnorm=0.264, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=17579 epoch 007: 1694 / 1707 loss=4.333, nll_loss=2.728, ppl=6.62, wps=295826, ups=0.69, wpb=428336, bsz=16438.7, num_updates=11900, lr=0.000579771, gnorm=0.264, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=17579 epoch 007: 1694 / 1707 loss=4.333, nll_loss=2.728, ppl=6.62, wps=295826, ups=0.69, wpb=428336, bsz=16438.7, num_updates=11900, lr=0.000579771, gnorm=0.264, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=17579 epoch 007: 1694 / 1707 loss=4.333, nll_loss=2.728, ppl=6.62, wps=295826, ups=0.69, wpb=428336, bsz=16438.7, num_updates=11900, lr=0.000579771, gnorm=0.264, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=17579 epoch 007: 1694 / 1707 loss=4.333, nll_loss=2.728, ppl=6.62, wps=295826, ups=0.69, wpb=428336, bsz=16438.7, num_updates=11900, lr=0.000579771, gnorm=0.264, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=17579 epoch 007: 1694 / 1707 loss=4.333, nll_loss=2.728, ppl=6.62, wps=295826, ups=0.69, wpb=428336, bsz=16438.7, num_updates=11900, lr=0.000579771, gnorm=0.264, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=17579 epoch 007: 1694 / 1707 loss=4.333, nll_loss=2.728, ppl=6.62, wps=295826, ups=0.69, wpb=428336, bsz=16438.7, num_updates=11900, lr=0.000579771, gnorm=0.264, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=17579 end of epoch 7 (average epoch stats below) epoch 007 | loss 4.346 | nll_loss 2.74 | ppl 6.68 | wps 292598 | ups 0.68 | wpb 428939 | bsz 16327.7 | num_updates 11913 | lr 0.000579455 | gnorm 0.269 | clip 0 | loss_scale 4 | train_wall 2455 | gb_free 60.8 | wall 17597 epoch 007 | loss 4.346 | nll_loss 2.74 | ppl 6.68 | wps 292598 | ups 0.68 | wpb 428939 | bsz 16327.7 | num_updates 11913 | lr 0.000579455 | gnorm 0.269 | clip 0 | loss_scale 4 | train_wall 2455 | gb_free 60.8 | wall 17597 epoch 007 | loss 4.346 | nll_loss 2.74 | ppl 6.68 | wps 292598 | ups 0.68 | wpb 428939 | bsz 16327.7 | num_updates 11913 | lr 0.000579455 | gnorm 0.269 | clip 0 | loss_scale 4 | train_wall 2455 | gb_free 60.8 | wall 17597 epoch 007 | loss 4.346 | nll_loss 2.74 | ppl 6.68 | wps 292598 | ups 0.68 | wpb 428939 | bsz 16327.7 | num_updates 11913 | lr 0.000579455 | gnorm 0.269 | clip 0 | loss_scale 4 | train_wall 2455 | gb_free 60.8 | wall 17597 epoch 007 | loss 4.346 | nll_loss 2.74 | ppl 6.68 | wps 292598 | ups 0.68 | wpb 428939 | bsz 16327.7 | num_updates 11913 | lr 0.000579455 | gnorm 0.269 | clip 0 | loss_scale 4 | train_wall 2455 | gb_free 60.8 | wall 17597 epoch 007 | loss 4.346 | nll_loss 2.74 | ppl 6.68 | wps 292598 | ups 0.68 | wpb 428939 | bsz 16327.7 | num_updates 11913 | lr 0.000579455 | gnorm 0.269 | clip 0 | loss_scale 4 | train_wall 2455 | gb_free 60.8 | wall 17597 epoch 007 | loss 4.346 | nll_loss 2.74 | ppl 6.68 | wps 292598 | ups 0.68 | wpb 428939 | bsz 16327.7 | num_updates 11913 | lr 0.000579455 | gnorm 0.269 | clip 0 | loss_scale 4 | train_wall 2455 | gb_free 60.8 | wall 17597 Start iterating over samples epoch 008: 87 / 1707 loss=4.298, nll_loss=2.686, ppl=6.44, wps=295454, ups=0.69, wpb=425825, bsz=16091.9, num_updates=12000, lr=0.00057735, gnorm=0.267, clip=0, loss_scale=8, train_wall=143, gb_free=59.3, wall=17724 epoch 008: 87 / 1707 loss=4.298, nll_loss=2.686, ppl=6.44, wps=295454, ups=0.69, wpb=425825, bsz=16091.9, num_updates=12000, lr=0.00057735, gnorm=0.267, clip=0, loss_scale=8, train_wall=143, gb_free=59.3, wall=17724 epoch 008: 87 / 1707 loss=4.298, nll_loss=2.686, ppl=6.44, wps=295454, ups=0.69, wpb=425825, bsz=16091.9, num_updates=12000, lr=0.00057735, gnorm=0.267, clip=0, loss_scale=8, train_wall=143, gb_free=59.3, wall=17724 epoch 008: 87 / 1707 loss=4.298, nll_loss=2.686, ppl=6.44, wps=295454, ups=0.69, wpb=425825, bsz=16091.9, num_updates=12000, lr=0.00057735, gnorm=0.267, clip=0, loss_scale=8, train_wall=143, gb_free=59.3, wall=17724 epoch 008: 87 / 1707 loss=4.298, nll_loss=2.686, ppl=6.44, wps=295454, ups=0.69, wpb=425825, bsz=16091.9, num_updates=12000, lr=0.00057735, gnorm=0.267, clip=0, loss_scale=8, train_wall=143, gb_free=59.3, wall=17724 epoch 008: 87 / 1707 loss=4.298, nll_loss=2.686, ppl=6.44, wps=295454, ups=0.69, wpb=425825, bsz=16091.9, num_updates=12000, lr=0.00057735, gnorm=0.267, clip=0, loss_scale=8, train_wall=143, gb_free=59.3, wall=17724 epoch 008: 87 / 1707 loss=4.298, nll_loss=2.686, ppl=6.44, wps=295454, ups=0.69, wpb=425825, bsz=16091.9, num_updates=12000, lr=0.00057735, gnorm=0.267, clip=0, loss_scale=8, train_wall=143, gb_free=59.3, wall=17724 epoch 008: 87 / 1707 loss=4.298, nll_loss=2.686, ppl=6.44, wps=295454, ups=0.69, wpb=425825, bsz=16091.9, num_updates=12000, lr=0.00057735, gnorm=0.267, clip=0, loss_scale=8, train_wall=143, gb_free=59.3, wall=17724 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 4.407 | nll_loss 2.796 | ppl 6.95 | wps 76911.4 | wpb 21331 | bsz 1016 | num_updates 12000 | best_loss 4.407 epoch 008 | valid on 'valid' subset | loss 4.407 | nll_loss 2.796 | ppl 6.95 | wps 76911.4 | wpb 21331 | bsz 1016 | num_updates 12000 | best_loss 4.407 epoch 008 | valid on 'valid' subset | loss 4.407 | nll_loss 2.796 | ppl 6.95 | wps 76911.4 | wpb 21331 | bsz 1016 | num_updates 12000 | best_loss 4.407 epoch 008 | valid on 'valid' subset | loss 4.407 | nll_loss 2.796 | ppl 6.95 | wps 76911.4 | wpb 21331 | bsz 1016 | num_updates 12000 | best_loss 4.407 epoch 008 | valid on 'valid' subset | loss 4.407 | nll_loss 2.796 | ppl 6.95 | wps 76911.4 | wpb 21331 | bsz 1016 | num_updates 12000 | best_loss 4.407 epoch 008 | valid on 'valid' subset | loss 4.407 | nll_loss 2.796 | ppl 6.95 | wps 76911.4 | wpb 21331 | bsz 1016 | num_updates 12000 | best_loss 4.407 epoch 008 | valid on 'valid' subset | loss 4.407 | nll_loss 2.796 | ppl 6.95 | wps 76911.4 | wpb 21331 | bsz 1016 | num_updates 12000 | best_loss 4.407 epoch 008 | valid on 'valid' subset | loss 4.407 | nll_loss 2.796 | ppl 6.95 | wps 76911.4 | wpb 21331 | bsz 1016 | num_updates 12000 | best_loss 4.407 epoch 008: 187 / 1707 loss=4.295, nll_loss=2.683, ppl=6.42, wps=262440, ups=0.61, wpb=428899, bsz=16095.6, num_updates=12100, lr=0.00057496, gnorm=0.265, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=17887 epoch 008: 187 / 1707 loss=4.295, nll_loss=2.683, ppl=6.42, wps=262440, ups=0.61, wpb=428899, bsz=16095.6, num_updates=12100, lr=0.00057496, gnorm=0.265, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=17887 epoch 008: 187 / 1707 loss=4.295, nll_loss=2.683, ppl=6.42, wps=262440, ups=0.61, wpb=428899, bsz=16095.6, num_updates=12100, lr=0.00057496, gnorm=0.265, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=17887 epoch 008: 187 / 1707 loss=4.295, nll_loss=2.683, ppl=6.42, wps=262440, ups=0.61, wpb=428899, bsz=16095.6, num_updates=12100, lr=0.00057496, gnorm=0.265, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=17887 epoch 008: 187 / 1707 loss=4.295, nll_loss=2.683, ppl=6.42, wps=262440, ups=0.61, wpb=428899, bsz=16095.6, num_updates=12100, lr=0.00057496, gnorm=0.265, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=17887 epoch 008: 187 / 1707 loss=4.295, nll_loss=2.683, ppl=6.42, wps=262440, ups=0.61, wpb=428899, bsz=16095.6, num_updates=12100, lr=0.00057496, gnorm=0.265, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=17887 epoch 008: 187 / 1707 loss=4.295, nll_loss=2.683, ppl=6.42, wps=262440, ups=0.61, wpb=428899, bsz=16095.6, num_updates=12100, lr=0.00057496, gnorm=0.265, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=17887 epoch 008: 187 / 1707 loss=4.295, nll_loss=2.683, ppl=6.42, wps=262440, ups=0.61, wpb=428899, bsz=16095.6, num_updates=12100, lr=0.00057496, gnorm=0.265, clip=0, loss_scale=8, train_wall=143, gb_free=59, wall=17887 epoch 008: 289 / 1707 loss=4.293, nll_loss=2.681, ppl=6.41, wps=290192, ups=0.68, wpb=428565, bsz=16306.6, num_updates=12200, lr=0.000572598, gnorm=0.264, clip=0, loss_scale=2, train_wall=147, gb_free=58.8, wall=18035 epoch 008: 289 / 1707 loss=4.293, nll_loss=2.681, ppl=6.41, wps=290192, ups=0.68, wpb=428565, bsz=16306.6, num_updates=12200, lr=0.000572598, gnorm=0.264, clip=0, loss_scale=2, train_wall=147, gb_free=58.8, wall=18035 epoch 008: 289 / 1707 loss=4.293, nll_loss=2.681, ppl=6.41, wps=290192, ups=0.68, wpb=428565, bsz=16306.6, num_updates=12200, lr=0.000572598, gnorm=0.264, clip=0, loss_scale=2, train_wall=147, gb_free=58.8, wall=18035 epoch 008: 289 / 1707 loss=4.293, nll_loss=2.681, ppl=6.41, wps=290192, ups=0.68, wpb=428565, bsz=16306.6, num_updates=12200, lr=0.000572598, gnorm=0.264, clip=0, loss_scale=2, train_wall=147, gb_free=58.8, wall=18035 epoch 008: 289 / 1707 loss=4.293, nll_loss=2.681, ppl=6.41, wps=290192, ups=0.68, wpb=428565, bsz=16306.6, num_updates=12200, lr=0.000572598, gnorm=0.264, clip=0, loss_scale=2, train_wall=147, gb_free=58.8, wall=18035 epoch 008: 289 / 1707 loss=4.293, nll_loss=2.681, ppl=6.41, wps=290192, ups=0.68, wpb=428565, bsz=16306.6, num_updates=12200, lr=0.000572598, gnorm=0.264, clip=0, loss_scale=2, train_wall=147, gb_free=58.8, wall=18035 epoch 008: 289 / 1707 loss=4.293, nll_loss=2.681, ppl=6.41, wps=290192, ups=0.68, wpb=428565, bsz=16306.6, num_updates=12200, lr=0.000572598, gnorm=0.264, clip=0, loss_scale=2, train_wall=147, gb_free=58.8, wall=18035 epoch 008: 289 / 1707 loss=4.293, nll_loss=2.681, ppl=6.41, wps=290192, ups=0.68, wpb=428565, bsz=16306.6, num_updates=12200, lr=0.000572598, gnorm=0.264, clip=0, loss_scale=2, train_wall=147, gb_free=58.8, wall=18035 epoch 008: 389 / 1707 loss=4.303, nll_loss=2.693, ppl=6.47, wps=297993, ups=0.69, wpb=429808, bsz=16479.5, num_updates=12300, lr=0.000570266, gnorm=0.25, clip=0, loss_scale=2, train_wall=143, gb_free=58.8, wall=18179 epoch 008: 389 / 1707 loss=4.303, nll_loss=2.693, ppl=6.47, wps=297993, ups=0.69, wpb=429808, bsz=16479.5, num_updates=12300, lr=0.000570266, gnorm=0.25, clip=0, loss_scale=2, train_wall=143, gb_free=58.8, wall=18179 epoch 008: 389 / 1707 loss=4.303, nll_loss=2.693, ppl=6.47, wps=297993, ups=0.69, wpb=429808, bsz=16479.5, num_updates=12300, lr=0.000570266, gnorm=0.25, clip=0, loss_scale=2, train_wall=143, gb_free=58.8, wall=18179 epoch 008: 389 / 1707 loss=4.303, nll_loss=2.693, ppl=6.47, wps=297993, ups=0.69, wpb=429808, bsz=16479.5, num_updates=12300, lr=0.000570266, gnorm=0.25, clip=0, loss_scale=2, train_wall=143, gb_free=58.8, wall=18179 epoch 008: 389 / 1707 loss=4.303, nll_loss=2.693, ppl=6.47, wps=297993, ups=0.69, wpb=429808, bsz=16479.5, num_updates=12300, lr=0.000570266, gnorm=0.25, clip=0, loss_scale=2, train_wall=143, gb_free=58.8, wall=18179 epoch 008: 389 / 1707 loss=4.303, nll_loss=2.693, ppl=6.47, wps=297993, ups=0.69, wpb=429808, bsz=16479.5, num_updates=12300, lr=0.000570266, gnorm=0.25, clip=0, loss_scale=2, train_wall=143, gb_free=58.8, wall=18179 epoch 008: 389 / 1707 loss=4.303, nll_loss=2.693, ppl=6.47, wps=297993, ups=0.69, wpb=429808, bsz=16479.5, num_updates=12300, lr=0.000570266, gnorm=0.25, clip=0, loss_scale=2, train_wall=143, gb_free=58.8, wall=18179 epoch 008: 389 / 1707 loss=4.303, nll_loss=2.693, ppl=6.47, wps=297993, ups=0.69, wpb=429808, bsz=16479.5, num_updates=12300, lr=0.000570266, gnorm=0.25, clip=0, loss_scale=2, train_wall=143, gb_free=58.8, wall=18179 epoch 008: 489 / 1707 loss=4.307, nll_loss=2.697, ppl=6.49, wps=297072, ups=0.69, wpb=429450, bsz=16450.7, num_updates=12400, lr=0.000567962, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=18323 epoch 008: 489 / 1707 loss=4.307, nll_loss=2.697, ppl=6.49, wps=297072, ups=0.69, wpb=429450, bsz=16450.7, num_updates=12400, lr=0.000567962, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=18323 epoch 008: 489 / 1707 loss=4.307, nll_loss=2.697, ppl=6.49, wps=297072, ups=0.69, wpb=429450, bsz=16450.7, num_updates=12400, lr=0.000567962, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=18323 epoch 008: 489 / 1707 loss=4.307, nll_loss=2.697, ppl=6.49, wps=297072, ups=0.69, wpb=429450, bsz=16450.7, num_updates=12400, lr=0.000567962, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=18323 epoch 008: 489 / 1707 loss=4.307, nll_loss=2.697, ppl=6.49, wps=297072, ups=0.69, wpb=429450, bsz=16450.7, num_updates=12400, lr=0.000567962, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=18323 epoch 008: 489 / 1707 loss=4.307, nll_loss=2.697, ppl=6.49, wps=297072, ups=0.69, wpb=429450, bsz=16450.7, num_updates=12400, lr=0.000567962, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=18323 epoch 008: 489 / 1707 loss=4.307, nll_loss=2.697, ppl=6.49, wps=297072, ups=0.69, wpb=429450, bsz=16450.7, num_updates=12400, lr=0.000567962, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=18323 epoch 008: 489 / 1707 loss=4.307, nll_loss=2.697, ppl=6.49, wps=297072, ups=0.69, wpb=429450, bsz=16450.7, num_updates=12400, lr=0.000567962, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=18323 epoch 008: 589 / 1707 loss=4.297, nll_loss=2.686, ppl=6.43, wps=295841, ups=0.69, wpb=429064, bsz=16376.9, num_updates=12500, lr=0.000565685, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=18468 epoch 008: 589 / 1707 loss=4.297, nll_loss=2.686, ppl=6.43, wps=295841, ups=0.69, wpb=429064, bsz=16376.9, num_updates=12500, lr=0.000565685, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=18468 epoch 008: 589 / 1707 loss=4.297, nll_loss=2.686, ppl=6.43, wps=295841, ups=0.69, wpb=429064, bsz=16376.9, num_updates=12500, lr=0.000565685, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=18468 epoch 008: 589 / 1707 loss=4.297, nll_loss=2.686, ppl=6.43, wps=295841, ups=0.69, wpb=429064, bsz=16376.9, num_updates=12500, lr=0.000565685, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=18468 epoch 008: 589 / 1707 loss=4.297, nll_loss=2.686, ppl=6.43, wps=295841, ups=0.69, wpb=429064, bsz=16376.9, num_updates=12500, lr=0.000565685, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=18468 epoch 008: 589 / 1707 loss=4.297, nll_loss=2.686, ppl=6.43, wps=295841, ups=0.69, wpb=429064, bsz=16376.9, num_updates=12500, lr=0.000565685, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=18468 epoch 008: 589 / 1707 loss=4.297, nll_loss=2.686, ppl=6.43, wps=295841, ups=0.69, wpb=429064, bsz=16376.9, num_updates=12500, lr=0.000565685, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=18468 epoch 008: 589 / 1707 loss=4.297, nll_loss=2.686, ppl=6.43, wps=295841, ups=0.69, wpb=429064, bsz=16376.9, num_updates=12500, lr=0.000565685, gnorm=0.271, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=18468 epoch 008: 689 / 1707 loss=4.302, nll_loss=2.692, ppl=6.46, wps=295733, ups=0.69, wpb=428654, bsz=16484.2, num_updates=12600, lr=0.000563436, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=18613 epoch 008: 689 / 1707 loss=4.302, nll_loss=2.692, ppl=6.46, wps=295733, ups=0.69, wpb=428654, bsz=16484.2, num_updates=12600, lr=0.000563436, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=18613 epoch 008: 689 / 1707 loss=4.302, nll_loss=2.692, ppl=6.46, wps=295733, ups=0.69, wpb=428654, bsz=16484.2, num_updates=12600, lr=0.000563436, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=18613 epoch 008: 689 / 1707 loss=4.302, nll_loss=2.692, ppl=6.46, wps=295733, ups=0.69, wpb=428654, bsz=16484.2, num_updates=12600, lr=0.000563436, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=18613 epoch 008: 689 / 1707 loss=4.302, nll_loss=2.692, ppl=6.46, wps=295733, ups=0.69, wpb=428654, bsz=16484.2, num_updates=12600, lr=0.000563436, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=18613 epoch 008: 689 / 1707 loss=4.302, nll_loss=2.692, ppl=6.46, wps=295733, ups=0.69, wpb=428654, bsz=16484.2, num_updates=12600, lr=0.000563436, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=18613 epoch 008: 689 / 1707 loss=4.302, nll_loss=2.692, ppl=6.46, wps=295733, ups=0.69, wpb=428654, bsz=16484.2, num_updates=12600, lr=0.000563436, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=18613 epoch 008: 689 / 1707 loss=4.302, nll_loss=2.692, ppl=6.46, wps=295733, ups=0.69, wpb=428654, bsz=16484.2, num_updates=12600, lr=0.000563436, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=18613 epoch 008: 789 / 1707 loss=4.309, nll_loss=2.7, ppl=6.5, wps=296929, ups=0.69, wpb=429707, bsz=16360.9, num_updates=12700, lr=0.000561214, gnorm=0.256, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=18758 epoch 008: 789 / 1707 loss=4.309, nll_loss=2.7, ppl=6.5, wps=296929, ups=0.69, wpb=429707, bsz=16360.9, num_updates=12700, lr=0.000561214, gnorm=0.256, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=18758 epoch 008: 789 / 1707 loss=4.309, nll_loss=2.7, ppl=6.5, wps=296929, ups=0.69, wpb=429707, bsz=16360.9, num_updates=12700, lr=0.000561214, gnorm=0.256, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=18758 epoch 008: 789 / 1707 loss=4.309, nll_loss=2.7, ppl=6.5, wps=296929, ups=0.69, wpb=429707, bsz=16360.9, num_updates=12700, lr=0.000561214, gnorm=0.256, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=18758 epoch 008: 789 / 1707 loss=4.309, nll_loss=2.7, ppl=6.5, wps=296929, ups=0.69, wpb=429707, bsz=16360.9, num_updates=12700, lr=0.000561214, gnorm=0.256, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=18758 epoch 008: 789 / 1707 loss=4.309, nll_loss=2.7, ppl=6.5, wps=296929, ups=0.69, wpb=429707, bsz=16360.9, num_updates=12700, lr=0.000561214, gnorm=0.256, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=18758 epoch 008: 789 / 1707 loss=4.309, nll_loss=2.7, ppl=6.5, wps=296929, ups=0.69, wpb=429707, bsz=16360.9, num_updates=12700, lr=0.000561214, gnorm=0.256, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=18758 epoch 008: 789 / 1707 loss=4.309, nll_loss=2.7, ppl=6.5, wps=296929, ups=0.69, wpb=429707, bsz=16360.9, num_updates=12700, lr=0.000561214, gnorm=0.256, clip=0, loss_scale=8, train_wall=144, gb_free=59.2, wall=18758 epoch 008: 890 / 1707 loss=4.298, nll_loss=2.688, ppl=6.44, wps=292461, ups=0.68, wpb=427834, bsz=16239.5, num_updates=12800, lr=0.000559017, gnorm=0.257, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=18904 epoch 008: 890 / 1707 loss=4.298, nll_loss=2.688, ppl=6.44, wps=292461, ups=0.68, wpb=427834, bsz=16239.5, num_updates=12800, lr=0.000559017, gnorm=0.257, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=18904 epoch 008: 890 / 1707 loss=4.298, nll_loss=2.688, ppl=6.44, wps=292461, ups=0.68, wpb=427834, bsz=16239.5, num_updates=12800, lr=0.000559017, gnorm=0.257, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=18904 epoch 008: 890 / 1707 loss=4.298, nll_loss=2.688, ppl=6.44, wps=292461, ups=0.68, wpb=427834, bsz=16239.5, num_updates=12800, lr=0.000559017, gnorm=0.257, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=18904 epoch 008: 890 / 1707 loss=4.298, nll_loss=2.688, ppl=6.44, wps=292461, ups=0.68, wpb=427834, bsz=16239.5, num_updates=12800, lr=0.000559017, gnorm=0.257, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=18904 epoch 008: 890 / 1707 loss=4.298, nll_loss=2.688, ppl=6.44, wps=292461, ups=0.68, wpb=427834, bsz=16239.5, num_updates=12800, lr=0.000559017, gnorm=0.257, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=18904 epoch 008: 890 / 1707 loss=4.298, nll_loss=2.688, ppl=6.44, wps=292461, ups=0.68, wpb=427834, bsz=16239.5, num_updates=12800, lr=0.000559017, gnorm=0.257, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=18904 epoch 008: 890 / 1707 loss=4.298, nll_loss=2.688, ppl=6.44, wps=292461, ups=0.68, wpb=427834, bsz=16239.5, num_updates=12800, lr=0.000559017, gnorm=0.257, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=18904 epoch 008: 990 / 1707 loss=4.297, nll_loss=2.687, ppl=6.44, wps=295126, ups=0.69, wpb=428933, bsz=16431, num_updates=12900, lr=0.000556846, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=19050 epoch 008: 990 / 1707 loss=4.297, nll_loss=2.687, ppl=6.44, wps=295126, ups=0.69, wpb=428933, bsz=16431, num_updates=12900, lr=0.000556846, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=19050 epoch 008: 990 / 1707 loss=4.297, nll_loss=2.687, ppl=6.44, wps=295126, ups=0.69, wpb=428933, bsz=16431, num_updates=12900, lr=0.000556846, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=19050 epoch 008: 990 / 1707 loss=4.297, nll_loss=2.687, ppl=6.44, wps=295126, ups=0.69, wpb=428933, bsz=16431, num_updates=12900, lr=0.000556846, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=19050 epoch 008: 990 / 1707 loss=4.297, nll_loss=2.687, ppl=6.44, wps=295126, ups=0.69, wpb=428933, bsz=16431, num_updates=12900, lr=0.000556846, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=19050 epoch 008: 990 / 1707 loss=4.297, nll_loss=2.687, ppl=6.44, wps=295126, ups=0.69, wpb=428933, bsz=16431, num_updates=12900, lr=0.000556846, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=19050 epoch 008: 990 / 1707 loss=4.297, nll_loss=2.687, ppl=6.44, wps=295126, ups=0.69, wpb=428933, bsz=16431, num_updates=12900, lr=0.000556846, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=19050 epoch 008: 990 / 1707 loss=4.297, nll_loss=2.687, ppl=6.44, wps=295126, ups=0.69, wpb=428933, bsz=16431, num_updates=12900, lr=0.000556846, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=19050 epoch 008: 1090 / 1707 loss=4.301, nll_loss=2.692, ppl=6.46, wps=296635, ups=0.69, wpb=429281, bsz=16273.5, num_updates=13000, lr=0.0005547, gnorm=0.256, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=19194 epoch 008: 1090 / 1707 loss=4.301, nll_loss=2.692, ppl=6.46, wps=296635, ups=0.69, wpb=429281, bsz=16273.5, num_updates=13000, lr=0.0005547, gnorm=0.256, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=19194 epoch 008: 1090 / 1707 loss=4.301, nll_loss=2.692, ppl=6.46, wps=296635, ups=0.69, wpb=429281, bsz=16273.5, num_updates=13000, lr=0.0005547, gnorm=0.256, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=19194 epoch 008: 1090 / 1707 loss=4.301, nll_loss=2.692, ppl=6.46, wps=296635, ups=0.69, wpb=429281, bsz=16273.5, num_updates=13000, lr=0.0005547, gnorm=0.256, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=19194 epoch 008: 1090 / 1707 loss=4.301, nll_loss=2.692, ppl=6.46, wps=296635, ups=0.69, wpb=429281, bsz=16273.5, num_updates=13000, lr=0.0005547, gnorm=0.256, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=19194 epoch 008: 1090 / 1707 loss=4.301, nll_loss=2.692, ppl=6.46, wps=296635, ups=0.69, wpb=429281, bsz=16273.5, num_updates=13000, lr=0.0005547, gnorm=0.256, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=19194 epoch 008: 1090 / 1707 loss=4.301, nll_loss=2.692, ppl=6.46, wps=296635, ups=0.69, wpb=429281, bsz=16273.5, num_updates=13000, lr=0.0005547, gnorm=0.256, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=19194 epoch 008: 1090 / 1707 loss=4.301, nll_loss=2.692, ppl=6.46, wps=296635, ups=0.69, wpb=429281, bsz=16273.5, num_updates=13000, lr=0.0005547, gnorm=0.256, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=19194 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 4.394 | nll_loss 2.783 | ppl 6.88 | wps 76689.7 | wpb 21331 | bsz 1016 | num_updates 13000 | best_loss 4.394 epoch 008 | valid on 'valid' subset | loss 4.394 | nll_loss 2.783 | ppl 6.88 | wps 76689.7 | wpb 21331 | bsz 1016 | num_updates 13000 | best_loss 4.394 epoch 008 | valid on 'valid' subset | loss 4.394 | nll_loss 2.783 | ppl 6.88 | wps 76689.7 | wpb 21331 | bsz 1016 | num_updates 13000 | best_loss 4.394 epoch 008 | valid on 'valid' subset | loss 4.394 | nll_loss 2.783 | ppl 6.88 | wps 76689.7 | wpb 21331 | bsz 1016 | num_updates 13000 | best_loss 4.394 epoch 008 | valid on 'valid' subset | loss 4.394 | nll_loss 2.783 | ppl 6.88 | wps 76689.7 | wpb 21331 | bsz 1016 | num_updates 13000 | best_loss 4.394 epoch 008 | valid on 'valid' subset | loss 4.394 | nll_loss 2.783 | ppl 6.88 | wps 76689.7 | wpb 21331 | bsz 1016 | num_updates 13000 | best_loss 4.394 epoch 008 | valid on 'valid' subset | loss 4.394 | nll_loss 2.783 | ppl 6.88 | wps 76689.7 | wpb 21331 | bsz 1016 | num_updates 13000 | best_loss 4.394 epoch 008 | valid on 'valid' subset | loss 4.394 | nll_loss 2.783 | ppl 6.88 | wps 76689.7 | wpb 21331 | bsz 1016 | num_updates 13000 | best_loss 4.394 epoch 008: 1191 / 1707 loss=4.293, nll_loss=2.683, ppl=6.42, wps=245564, ups=0.57, wpb=427314, bsz=16549, num_updates=13100, lr=0.000552579, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=19369 epoch 008: 1191 / 1707 loss=4.293, nll_loss=2.683, ppl=6.42, wps=245564, ups=0.57, wpb=427314, bsz=16549, num_updates=13100, lr=0.000552579, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=19369 epoch 008: 1191 / 1707 loss=4.293, nll_loss=2.683, ppl=6.42, wps=245564, ups=0.57, wpb=427314, bsz=16549, num_updates=13100, lr=0.000552579, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=19369 epoch 008: 1191 / 1707 loss=4.293, nll_loss=2.683, ppl=6.42, wps=245564, ups=0.57, wpb=427314, bsz=16549, num_updates=13100, lr=0.000552579, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=19369 epoch 008: 1191 / 1707 loss=4.293, nll_loss=2.683, ppl=6.42, wps=245564, ups=0.57, wpb=427314, bsz=16549, num_updates=13100, lr=0.000552579, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=19369 epoch 008: 1191 / 1707 loss=4.293, nll_loss=2.683, ppl=6.42, wps=245564, ups=0.57, wpb=427314, bsz=16549, num_updates=13100, lr=0.000552579, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=19369 epoch 008: 1191 / 1707 loss=4.293, nll_loss=2.683, ppl=6.42, wps=245564, ups=0.57, wpb=427314, bsz=16549, num_updates=13100, lr=0.000552579, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=19369 epoch 008: 1191 / 1707 loss=4.293, nll_loss=2.683, ppl=6.42, wps=245564, ups=0.57, wpb=427314, bsz=16549, num_updates=13100, lr=0.000552579, gnorm=0.267, clip=0, loss_scale=4, train_wall=145, gb_free=58.9, wall=19369 epoch 008: 1291 / 1707 loss=4.299, nll_loss=2.69, ppl=6.45, wps=298260, ups=0.69, wpb=430141, bsz=16190.2, num_updates=13200, lr=0.000550482, gnorm=0.259, clip=0, loss_scale=4, train_wall=143, gb_free=58.9, wall=19513 epoch 008: 1291 / 1707 loss=4.299, nll_loss=2.69, ppl=6.45, wps=298260, ups=0.69, wpb=430141, bsz=16190.2, num_updates=13200, lr=0.000550482, gnorm=0.259, clip=0, loss_scale=4, train_wall=143, gb_free=58.9, wall=19513 epoch 008: 1291 / 1707 loss=4.299, nll_loss=2.69, ppl=6.45, wps=298260, ups=0.69, wpb=430141, bsz=16190.2, num_updates=13200, lr=0.000550482, gnorm=0.259, clip=0, loss_scale=4, train_wall=143, gb_free=58.9, wall=19513 epoch 008: 1291 / 1707 loss=4.299, nll_loss=2.69, ppl=6.45, wps=298260, ups=0.69, wpb=430141, bsz=16190.2, num_updates=13200, lr=0.000550482, gnorm=0.259, clip=0, loss_scale=4, train_wall=143, gb_free=58.9, wall=19513 epoch 008: 1291 / 1707 loss=4.299, nll_loss=2.69, ppl=6.45, wps=298260, ups=0.69, wpb=430141, bsz=16190.2, num_updates=13200, lr=0.000550482, gnorm=0.259, clip=0, loss_scale=4, train_wall=143, gb_free=58.9, wall=19513 epoch 008: 1291 / 1707 loss=4.299, nll_loss=2.69, ppl=6.45, wps=298260, ups=0.69, wpb=430141, bsz=16190.2, num_updates=13200, lr=0.000550482, gnorm=0.259, clip=0, loss_scale=4, train_wall=143, gb_free=58.9, wall=19513 epoch 008: 1291 / 1707 loss=4.299, nll_loss=2.69, ppl=6.45, wps=298260, ups=0.69, wpb=430141, bsz=16190.2, num_updates=13200, lr=0.000550482, gnorm=0.259, clip=0, loss_scale=4, train_wall=143, gb_free=58.9, wall=19513 epoch 008: 1291 / 1707 loss=4.299, nll_loss=2.69, ppl=6.45, wps=298260, ups=0.69, wpb=430141, bsz=16190.2, num_updates=13200, lr=0.000550482, gnorm=0.259, clip=0, loss_scale=4, train_wall=143, gb_free=58.9, wall=19513 epoch 008: 1391 / 1707 loss=4.304, nll_loss=2.695, ppl=6.48, wps=297331, ups=0.69, wpb=430235, bsz=16489.7, num_updates=13300, lr=0.000548408, gnorm=0.272, clip=0, loss_scale=8, train_wall=144, gb_free=58.7, wall=19657 epoch 008: 1391 / 1707 loss=4.304, nll_loss=2.695, ppl=6.48, wps=297331, ups=0.69, wpb=430235, bsz=16489.7, num_updates=13300, lr=0.000548408, gnorm=0.272, clip=0, loss_scale=8, train_wall=144, gb_free=58.7, wall=19657 epoch 008: 1391 / 1707 loss=4.304, nll_loss=2.695, ppl=6.48, wps=297331, ups=0.69, wpb=430235, bsz=16489.7, num_updates=13300, lr=0.000548408, gnorm=0.272, clip=0, loss_scale=8, train_wall=144, gb_free=58.7, wall=19657 epoch 008: 1391 / 1707 loss=4.304, nll_loss=2.695, ppl=6.48, wps=297331, ups=0.69, wpb=430235, bsz=16489.7, num_updates=13300, lr=0.000548408, gnorm=0.272, clip=0, loss_scale=8, train_wall=144, gb_free=58.7, wall=19657 epoch 008: 1391 / 1707 loss=4.304, nll_loss=2.695, ppl=6.48, wps=297331, ups=0.69, wpb=430235, bsz=16489.7, num_updates=13300, lr=0.000548408, gnorm=0.272, clip=0, loss_scale=8, train_wall=144, gb_free=58.7, wall=19657 epoch 008: 1391 / 1707 loss=4.304, nll_loss=2.695, ppl=6.48, wps=297331, ups=0.69, wpb=430235, bsz=16489.7, num_updates=13300, lr=0.000548408, gnorm=0.272, clip=0, loss_scale=8, train_wall=144, gb_free=58.7, wall=19657 epoch 008: 1391 / 1707 loss=4.304, nll_loss=2.695, ppl=6.48, wps=297331, ups=0.69, wpb=430235, bsz=16489.7, num_updates=13300, lr=0.000548408, gnorm=0.272, clip=0, loss_scale=8, train_wall=144, gb_free=58.7, wall=19657 epoch 008: 1391 / 1707 loss=4.304, nll_loss=2.695, ppl=6.48, wps=297331, ups=0.69, wpb=430235, bsz=16489.7, num_updates=13300, lr=0.000548408, gnorm=0.272, clip=0, loss_scale=8, train_wall=144, gb_free=58.7, wall=19657 epoch 008: 1492 / 1707 loss=4.302, nll_loss=2.693, ppl=6.47, wps=293934, ups=0.68, wpb=429134, bsz=16196.5, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=19803 epoch 008: 1492 / 1707 loss=4.302, nll_loss=2.693, ppl=6.47, wps=293934, ups=0.68, wpb=429134, bsz=16196.5, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=19803 epoch 008: 1492 / 1707 loss=4.302, nll_loss=2.693, ppl=6.47, wps=293934, ups=0.68, wpb=429134, bsz=16196.5, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=19803 epoch 008: 1492 / 1707 loss=4.302, nll_loss=2.693, ppl=6.47, wps=293934, ups=0.68, wpb=429134, bsz=16196.5, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=19803 epoch 008: 1492 / 1707 loss=4.302, nll_loss=2.693, ppl=6.47, wps=293934, ups=0.68, wpb=429134, bsz=16196.5, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=19803 epoch 008: 1492 / 1707 loss=4.302, nll_loss=2.693, ppl=6.47, wps=293934, ups=0.68, wpb=429134, bsz=16196.5, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=19803 epoch 008: 1492 / 1707 loss=4.302, nll_loss=2.693, ppl=6.47, wps=293934, ups=0.68, wpb=429134, bsz=16196.5, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=19803 epoch 008: 1492 / 1707 loss=4.302, nll_loss=2.693, ppl=6.47, wps=293934, ups=0.68, wpb=429134, bsz=16196.5, num_updates=13400, lr=0.000546358, gnorm=0.255, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=19803 epoch 008: 1592 / 1707 loss=4.291, nll_loss=2.682, ppl=6.42, wps=297325, ups=0.69, wpb=429173, bsz=16274.6, num_updates=13500, lr=0.000544331, gnorm=0.261, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=19948 epoch 008: 1592 / 1707 loss=4.291, nll_loss=2.682, ppl=6.42, wps=297325, ups=0.69, wpb=429173, bsz=16274.6, num_updates=13500, lr=0.000544331, gnorm=0.261, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=19948 epoch 008: 1592 / 1707 loss=4.291, nll_loss=2.682, ppl=6.42, wps=297325, ups=0.69, wpb=429173, bsz=16274.6, num_updates=13500, lr=0.000544331, gnorm=0.261, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=19948 epoch 008: 1592 / 1707 loss=4.291, nll_loss=2.682, ppl=6.42, wps=297325, ups=0.69, wpb=429173, bsz=16274.6, num_updates=13500, lr=0.000544331, gnorm=0.261, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=19948 epoch 008: 1592 / 1707 loss=4.291, nll_loss=2.682, ppl=6.42, wps=297325, ups=0.69, wpb=429173, bsz=16274.6, num_updates=13500, lr=0.000544331, gnorm=0.261, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=19948 epoch 008: 1592 / 1707 loss=4.291, nll_loss=2.682, ppl=6.42, wps=297325, ups=0.69, wpb=429173, bsz=16274.6, num_updates=13500, lr=0.000544331, gnorm=0.261, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=19948 epoch 008: 1592 / 1707 loss=4.291, nll_loss=2.682, ppl=6.42, wps=297325, ups=0.69, wpb=429173, bsz=16274.6, num_updates=13500, lr=0.000544331, gnorm=0.261, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=19948 epoch 008: 1592 / 1707 loss=4.291, nll_loss=2.682, ppl=6.42, wps=297325, ups=0.69, wpb=429173, bsz=16274.6, num_updates=13500, lr=0.000544331, gnorm=0.261, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=19948 epoch 008: 1692 / 1707 loss=4.298, nll_loss=2.69, ppl=6.45, wps=295897, ups=0.69, wpb=429005, bsz=16172, num_updates=13600, lr=0.000542326, gnorm=0.26, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=20093 epoch 008: 1692 / 1707 loss=4.298, nll_loss=2.69, ppl=6.45, wps=295897, ups=0.69, wpb=429005, bsz=16172, num_updates=13600, lr=0.000542326, gnorm=0.26, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=20093 epoch 008: 1692 / 1707 loss=4.298, nll_loss=2.69, ppl=6.45, wps=295897, ups=0.69, wpb=429005, bsz=16172, num_updates=13600, lr=0.000542326, gnorm=0.26, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=20093 epoch 008: 1692 / 1707 loss=4.298, nll_loss=2.69, ppl=6.45, wps=295897, ups=0.69, wpb=429005, bsz=16172, num_updates=13600, lr=0.000542326, gnorm=0.26, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=20093 epoch 008: 1692 / 1707 loss=4.298, nll_loss=2.69, ppl=6.45, wps=295897, ups=0.69, wpb=429005, bsz=16172, num_updates=13600, lr=0.000542326, gnorm=0.26, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=20093 epoch 008: 1692 / 1707 loss=4.298, nll_loss=2.69, ppl=6.45, wps=295897, ups=0.69, wpb=429005, bsz=16172, num_updates=13600, lr=0.000542326, gnorm=0.26, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=20093 epoch 008: 1692 / 1707 loss=4.298, nll_loss=2.69, ppl=6.45, wps=295897, ups=0.69, wpb=429005, bsz=16172, num_updates=13600, lr=0.000542326, gnorm=0.26, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=20093 epoch 008: 1692 / 1707 loss=4.298, nll_loss=2.69, ppl=6.45, wps=295897, ups=0.69, wpb=429005, bsz=16172, num_updates=13600, lr=0.000542326, gnorm=0.26, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=20093 end of epoch 8 (average epoch stats below) epoch 008 | loss 4.299 | nll_loss 2.689 | ppl 6.45 | wps 290108 | ups 0.68 | wpb 428930 | bsz 16331.5 | num_updates 13615 | lr 0.000542027 | gnorm 0.261 | clip 0 | loss_scale 8 | train_wall 2455 | gb_free 59.1 | wall 20113 epoch 008 | loss 4.299 | nll_loss 2.689 | ppl 6.45 | wps 290108 | ups 0.68 | wpb 428930 | bsz 16331.5 | num_updates 13615 | lr 0.000542027 | gnorm 0.261 | clip 0 | loss_scale 8 | train_wall 2455 | gb_free 59.1 | wall 20113 epoch 008 | loss 4.299 | nll_loss 2.689 | ppl 6.45 | wps 290108 | ups 0.68 | wpb 428930 | bsz 16331.5 | num_updates 13615 | lr 0.000542027 | gnorm 0.261 | clip 0 | loss_scale 8 | train_wall 2455 | gb_free 59.1 | wall 20113 epoch 008 | loss 4.299 | nll_loss 2.689 | ppl 6.45 | wps 290108 | ups 0.68 | wpb 428930 | bsz 16331.5 | num_updates 13615 | lr 0.000542027 | gnorm 0.261 | clip 0 | loss_scale 8 | train_wall 2455 | gb_free 59.1 | wall 20113 epoch 008 | loss 4.299 | nll_loss 2.689 | ppl 6.45 | wps 290108 | ups 0.68 | wpb 428930 | bsz 16331.5 | num_updates 13615 | lr 0.000542027 | gnorm 0.261 | clip 0 | loss_scale 8 | train_wall 2455 | gb_free 59.1 | wall 20113 epoch 008 | loss 4.299 | nll_loss 2.689 | ppl 6.45 | wps 290108 | ups 0.68 | wpb 428930 | bsz 16331.5 | num_updates 13615 | lr 0.000542027 | gnorm 0.261 | clip 0 | loss_scale 8 | train_wall 2455 | gb_free 59.1 | wall 20113 epoch 008 | loss 4.299 | nll_loss 2.689 | ppl 6.45 | wps 290108 | ups 0.68 | wpb 428930 | bsz 16331.5 | num_updates 13615 | lr 0.000542027 | gnorm 0.261 | clip 0 | loss_scale 8 | train_wall 2455 | gb_free 59.1 | wall 20113 epoch 008 | loss 4.299 | nll_loss 2.689 | ppl 6.45 | wps 290108 | ups 0.68 | wpb 428930 | bsz 16331.5 | num_updates 13615 | lr 0.000542027 | gnorm 0.261 | clip 0 | loss_scale 8 | train_wall 2455 | gb_free 59.1 | wall 20113 Start iterating over samples epoch 009: 86 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292174, ups=0.69, wpb=426312, bsz=16470, num_updates=13700, lr=0.000540343, gnorm=0.253, clip=0, loss_scale=4, train_wall=145, gb_free=59.6, wall=20239 epoch 009: 86 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292174, ups=0.69, wpb=426312, bsz=16470, num_updates=13700, lr=0.000540343, gnorm=0.253, clip=0, loss_scale=4, train_wall=145, gb_free=59.6, wall=20239 epoch 009: 86 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292174, ups=0.69, wpb=426312, bsz=16470, num_updates=13700, lr=0.000540343, gnorm=0.253, clip=0, loss_scale=4, train_wall=145, gb_free=59.6, wall=20239 epoch 009: 86 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292174, ups=0.69, wpb=426312, bsz=16470, num_updates=13700, lr=0.000540343, gnorm=0.253, clip=0, loss_scale=4, train_wall=145, gb_free=59.6, wall=20239 epoch 009: 86 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292174, ups=0.69, wpb=426312, bsz=16470, num_updates=13700, lr=0.000540343, gnorm=0.253, clip=0, loss_scale=4, train_wall=145, gb_free=59.6, wall=20239 epoch 009: 86 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292174, ups=0.69, wpb=426312, bsz=16470, num_updates=13700, lr=0.000540343, gnorm=0.253, clip=0, loss_scale=4, train_wall=145, gb_free=59.6, wall=20239 epoch 009: 86 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292174, ups=0.69, wpb=426312, bsz=16470, num_updates=13700, lr=0.000540343, gnorm=0.253, clip=0, loss_scale=4, train_wall=145, gb_free=59.6, wall=20239 epoch 009: 86 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292174, ups=0.69, wpb=426312, bsz=16470, num_updates=13700, lr=0.000540343, gnorm=0.253, clip=0, loss_scale=4, train_wall=145, gb_free=59.6, wall=20239 epoch 009: 86 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292174, ups=0.69, wpb=426312, bsz=16470, num_updates=13700, lr=0.000540343, gnorm=0.253, clip=0, loss_scale=4, train_wall=145, gb_free=59.6, wall=20239 epoch 009: 186 / 1707 loss=4.253, nll_loss=2.637, ppl=6.22, wps=295994, ups=0.69, wpb=429726, bsz=16226.9, num_updates=13800, lr=0.000538382, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20384 epoch 009: 186 / 1707 loss=4.253, nll_loss=2.637, ppl=6.22, wps=295994, ups=0.69, wpb=429726, bsz=16226.9, num_updates=13800, lr=0.000538382, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20384 epoch 009: 186 / 1707 loss=4.253, nll_loss=2.637, ppl=6.22, wps=295994, ups=0.69, wpb=429726, bsz=16226.9, num_updates=13800, lr=0.000538382, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20384 epoch 009: 186 / 1707 loss=4.253, nll_loss=2.637, ppl=6.22, wps=295994, ups=0.69, wpb=429726, bsz=16226.9, num_updates=13800, lr=0.000538382, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20384 epoch 009: 186 / 1707 loss=4.253, nll_loss=2.637, ppl=6.22, wps=295994, ups=0.69, wpb=429726, bsz=16226.9, num_updates=13800, lr=0.000538382, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20384 epoch 009: 186 / 1707 loss=4.253, nll_loss=2.637, ppl=6.22, wps=295994, ups=0.69, wpb=429726, bsz=16226.9, num_updates=13800, lr=0.000538382, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20384 epoch 009: 186 / 1707 loss=4.253, nll_loss=2.637, ppl=6.22, wps=295994, ups=0.69, wpb=429726, bsz=16226.9, num_updates=13800, lr=0.000538382, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20384 epoch 009: 186 / 1707 loss=4.253, nll_loss=2.637, ppl=6.22, wps=295994, ups=0.69, wpb=429726, bsz=16226.9, num_updates=13800, lr=0.000538382, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20384 epoch 009: 186 / 1707 loss=4.253, nll_loss=2.637, ppl=6.22, wps=295994, ups=0.69, wpb=429726, bsz=16226.9, num_updates=13800, lr=0.000538382, gnorm=0.253, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20384 epoch 009: 287 / 1707 loss=4.261, nll_loss=2.646, ppl=6.26, wps=293430, ups=0.68, wpb=429081, bsz=16361.8, num_updates=13900, lr=0.000536442, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=20530 epoch 009: 287 / 1707 loss=4.261, nll_loss=2.646, ppl=6.26, wps=293430, ups=0.68, wpb=429081, bsz=16361.8, num_updates=13900, lr=0.000536442, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=20530 epoch 009: 287 / 1707 loss=4.261, nll_loss=2.646, ppl=6.26, wps=293430, ups=0.68, wpb=429081, bsz=16361.8, num_updates=13900, lr=0.000536442, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=20530 epoch 009: 287 / 1707 loss=4.261, nll_loss=2.646, ppl=6.26, wps=293430, ups=0.68, wpb=429081, bsz=16361.8, num_updates=13900, lr=0.000536442, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=20530 epoch 009: 287 / 1707 loss=4.261, nll_loss=2.646, ppl=6.26, wps=293430, ups=0.68, wpb=429081, bsz=16361.8, num_updates=13900, lr=0.000536442, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=20530 epoch 009: 287 / 1707 loss=4.261, nll_loss=2.646, ppl=6.26, wps=293430, ups=0.68, wpb=429081, bsz=16361.8, num_updates=13900, lr=0.000536442, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=20530 epoch 009: 287 / 1707 loss=4.261, nll_loss=2.646, ppl=6.26, wps=293430, ups=0.68, wpb=429081, bsz=16361.8, num_updates=13900, lr=0.000536442, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=20530 epoch 009: 287 / 1707 loss=4.261, nll_loss=2.646, ppl=6.26, wps=293430, ups=0.68, wpb=429081, bsz=16361.8, num_updates=13900, lr=0.000536442, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=20530 epoch 009: 287 / 1707 loss=4.261, nll_loss=2.646, ppl=6.26, wps=293430, ups=0.68, wpb=429081, bsz=16361.8, num_updates=13900, lr=0.000536442, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=20530 epoch 009: 387 / 1707 loss=4.26, nll_loss=2.645, ppl=6.25, wps=297155, ups=0.69, wpb=429072, bsz=16004.1, num_updates=14000, lr=0.000534522, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=20674 epoch 009: 387 / 1707 loss=4.26, nll_loss=2.645, ppl=6.25, wps=297155, ups=0.69, wpb=429072, bsz=16004.1, num_updates=14000, lr=0.000534522, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=20674 epoch 009: 387 / 1707 loss=4.26, nll_loss=2.645, ppl=6.25, wps=297155, ups=0.69, wpb=429072, bsz=16004.1, num_updates=14000, lr=0.000534522, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=20674 epoch 009: 387 / 1707 loss=4.26, nll_loss=2.645, ppl=6.25, wps=297155, ups=0.69, wpb=429072, bsz=16004.1, num_updates=14000, lr=0.000534522, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=20674 epoch 009: 387 / 1707 loss=4.26, nll_loss=2.645, ppl=6.25, wps=297155, ups=0.69, wpb=429072, bsz=16004.1, num_updates=14000, lr=0.000534522, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=20674 epoch 009: 387 / 1707 loss=4.26, nll_loss=2.645, ppl=6.25, wps=297155, ups=0.69, wpb=429072, bsz=16004.1, num_updates=14000, lr=0.000534522, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=20674 epoch 009: 387 / 1707 loss=4.26, nll_loss=2.645, ppl=6.25, wps=297155, ups=0.69, wpb=429072, bsz=16004.1, num_updates=14000, lr=0.000534522, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=20674 epoch 009: 387 / 1707 loss=4.26, nll_loss=2.645, ppl=6.25, wps=297155, ups=0.69, wpb=429072, bsz=16004.1, num_updates=14000, lr=0.000534522, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=20674 epoch 009: 387 / 1707 loss=4.26, nll_loss=2.645, ppl=6.25, wps=297155, ups=0.69, wpb=429072, bsz=16004.1, num_updates=14000, lr=0.000534522, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=20674 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 4.384 | nll_loss 2.768 | ppl 6.81 | wps 76682.7 | wpb 21331 | bsz 1016 | num_updates 14000 | best_loss 4.384 epoch 009 | valid on 'valid' subset | loss 4.384 | nll_loss 2.768 | ppl 6.81 | wps 76682.7 | wpb 21331 | bsz 1016 | num_updates 14000 | best_loss 4.384 epoch 009 | valid on 'valid' subset | loss 4.384 | nll_loss 2.768 | ppl 6.81 | wps 76682.7 | wpb 21331 | bsz 1016 | num_updates 14000 | best_loss 4.384 epoch 009 | valid on 'valid' subset | loss 4.384 | nll_loss 2.768 | ppl 6.81 | wps 76682.7 | wpb 21331 | bsz 1016 | num_updates 14000 | best_loss 4.384 epoch 009 | valid on 'valid' subset | loss 4.384 | nll_loss 2.768 | ppl 6.81 | wps 76682.7 | wpb 21331 | bsz 1016 | num_updates 14000 | best_loss 4.384 epoch 009 | valid on 'valid' subset | loss 4.384 | nll_loss 2.768 | ppl 6.81 | wps 76682.7 | wpb 21331 | bsz 1016 | num_updates 14000 | best_loss 4.384 epoch 009 | valid on 'valid' subset | loss 4.384 | nll_loss 2.768 | ppl 6.81 | wps 76682.7 | wpb 21331 | bsz 1016 | num_updates 14000 | best_loss 4.384 epoch 009 | valid on 'valid' subset | loss 4.384 | nll_loss 2.768 | ppl 6.81 | wps 76682.7 | wpb 21331 | bsz 1016 | num_updates 14000 | best_loss 4.384 epoch 009 | valid on 'valid' subset | loss 4.384 | nll_loss 2.768 | ppl 6.81 | wps 76682.7 | wpb 21331 | bsz 1016 | num_updates 14000 | best_loss 4.384 epoch 009: 487 / 1707 loss=4.265, nll_loss=2.651, ppl=6.28, wps=260472, ups=0.61, wpb=428951, bsz=16370.6, num_updates=14100, lr=0.000532624, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20839 epoch 009: 487 / 1707 loss=4.265, nll_loss=2.651, ppl=6.28, wps=260472, ups=0.61, wpb=428951, bsz=16370.6, num_updates=14100, lr=0.000532624, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20839 epoch 009: 487 / 1707 loss=4.265, nll_loss=2.651, ppl=6.28, wps=260472, ups=0.61, wpb=428951, bsz=16370.6, num_updates=14100, lr=0.000532624, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20839 epoch 009: 487 / 1707 loss=4.265, nll_loss=2.651, ppl=6.28, wps=260472, ups=0.61, wpb=428951, bsz=16370.6, num_updates=14100, lr=0.000532624, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20839 epoch 009: 487 / 1707 loss=4.265, nll_loss=2.651, ppl=6.28, wps=260472, ups=0.61, wpb=428951, bsz=16370.6, num_updates=14100, lr=0.000532624, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20839 epoch 009: 487 / 1707 loss=4.265, nll_loss=2.651, ppl=6.28, wps=260472, ups=0.61, wpb=428951, bsz=16370.6, num_updates=14100, lr=0.000532624, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20839 epoch 009: 487 / 1707 loss=4.265, nll_loss=2.651, ppl=6.28, wps=260472, ups=0.61, wpb=428951, bsz=16370.6, num_updates=14100, lr=0.000532624, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20839 epoch 009: 487 / 1707 loss=4.265, nll_loss=2.651, ppl=6.28, wps=260472, ups=0.61, wpb=428951, bsz=16370.6, num_updates=14100, lr=0.000532624, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20839 epoch 009: 487 / 1707 loss=4.265, nll_loss=2.651, ppl=6.28, wps=260472, ups=0.61, wpb=428951, bsz=16370.6, num_updates=14100, lr=0.000532624, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=20839 epoch 009: 588 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292181, ups=0.68, wpb=426733, bsz=16019.4, num_updates=14200, lr=0.000530745, gnorm=0.261, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=20985 epoch 009: 588 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292181, ups=0.68, wpb=426733, bsz=16019.4, num_updates=14200, lr=0.000530745, gnorm=0.261, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=20985 epoch 009: 588 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292181, ups=0.68, wpb=426733, bsz=16019.4, num_updates=14200, lr=0.000530745, gnorm=0.261, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=20985 epoch 009: 588 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292181, ups=0.68, wpb=426733, bsz=16019.4, num_updates=14200, lr=0.000530745, gnorm=0.261, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=20985 epoch 009: 588 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292181, ups=0.68, wpb=426733, bsz=16019.4, num_updates=14200, lr=0.000530745, gnorm=0.261, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=20985 epoch 009: 588 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292181, ups=0.68, wpb=426733, bsz=16019.4, num_updates=14200, lr=0.000530745, gnorm=0.261, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=20985 epoch 009: 588 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292181, ups=0.68, wpb=426733, bsz=16019.4, num_updates=14200, lr=0.000530745, gnorm=0.261, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=20985 epoch 009: 588 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292181, ups=0.68, wpb=426733, bsz=16019.4, num_updates=14200, lr=0.000530745, gnorm=0.261, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=20985 epoch 009: 588 / 1707 loss=4.257, nll_loss=2.642, ppl=6.24, wps=292181, ups=0.68, wpb=426733, bsz=16019.4, num_updates=14200, lr=0.000530745, gnorm=0.261, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=20985 epoch 009: 688 / 1707 loss=4.266, nll_loss=2.653, ppl=6.29, wps=297078, ups=0.69, wpb=429213, bsz=16334, num_updates=14300, lr=0.000528886, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=21130 epoch 009: 688 / 1707 loss=4.266, nll_loss=2.653, ppl=6.29, wps=297078, ups=0.69, wpb=429213, bsz=16334, num_updates=14300, lr=0.000528886, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=21130 epoch 009: 688 / 1707 loss=4.266, nll_loss=2.653, ppl=6.29, wps=297078, ups=0.69, wpb=429213, bsz=16334, num_updates=14300, lr=0.000528886, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=21130 epoch 009: 688 / 1707 loss=4.266, nll_loss=2.653, ppl=6.29, wps=297078, ups=0.69, wpb=429213, bsz=16334, num_updates=14300, lr=0.000528886, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=21130 epoch 009: 688 / 1707 loss=4.266, nll_loss=2.653, ppl=6.29, wps=297078, ups=0.69, wpb=429213, bsz=16334, num_updates=14300, lr=0.000528886, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=21130 epoch 009: 688 / 1707 loss=4.266, nll_loss=2.653, ppl=6.29, wps=297078, ups=0.69, wpb=429213, bsz=16334, num_updates=14300, lr=0.000528886, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=21130 epoch 009: 688 / 1707 loss=4.266, nll_loss=2.653, ppl=6.29, wps=297078, ups=0.69, wpb=429213, bsz=16334, num_updates=14300, lr=0.000528886, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=21130 epoch 009: 688 / 1707 loss=4.266, nll_loss=2.653, ppl=6.29, wps=297078, ups=0.69, wpb=429213, bsz=16334, num_updates=14300, lr=0.000528886, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=21130 epoch 009: 688 / 1707 loss=4.266, nll_loss=2.653, ppl=6.29, wps=297078, ups=0.69, wpb=429213, bsz=16334, num_updates=14300, lr=0.000528886, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=21130 epoch 009: 788 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=297251, ups=0.69, wpb=430362, bsz=16499.4, num_updates=14400, lr=0.000527046, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=21274 epoch 009: 788 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=297251, ups=0.69, wpb=430362, bsz=16499.4, num_updates=14400, lr=0.000527046, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=21274 epoch 009: 788 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=297251, ups=0.69, wpb=430362, bsz=16499.4, num_updates=14400, lr=0.000527046, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=21274 epoch 009: 788 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=297251, ups=0.69, wpb=430362, bsz=16499.4, num_updates=14400, lr=0.000527046, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=21274 epoch 009: 788 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=297251, ups=0.69, wpb=430362, bsz=16499.4, num_updates=14400, lr=0.000527046, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=21274 epoch 009: 788 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=297251, ups=0.69, wpb=430362, bsz=16499.4, num_updates=14400, lr=0.000527046, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=21274 epoch 009: 788 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=297251, ups=0.69, wpb=430362, bsz=16499.4, num_updates=14400, lr=0.000527046, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=21274 epoch 009: 788 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=297251, ups=0.69, wpb=430362, bsz=16499.4, num_updates=14400, lr=0.000527046, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=21274 epoch 009: 788 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=297251, ups=0.69, wpb=430362, bsz=16499.4, num_updates=14400, lr=0.000527046, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=21274 epoch 009: 889 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=294241, ups=0.68, wpb=430601, bsz=16342.3, num_updates=14500, lr=0.000525226, gnorm=0.26, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=21421 epoch 009: 889 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=294241, ups=0.68, wpb=430601, bsz=16342.3, num_updates=14500, lr=0.000525226, gnorm=0.26, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=21421 epoch 009: 889 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=294241, ups=0.68, wpb=430601, bsz=16342.3, num_updates=14500, lr=0.000525226, gnorm=0.26, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=21421 epoch 009: 889 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=294241, ups=0.68, wpb=430601, bsz=16342.3, num_updates=14500, lr=0.000525226, gnorm=0.26, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=21421 epoch 009: 889 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=294241, ups=0.68, wpb=430601, bsz=16342.3, num_updates=14500, lr=0.000525226, gnorm=0.26, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=21421 epoch 009: 889 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=294241, ups=0.68, wpb=430601, bsz=16342.3, num_updates=14500, lr=0.000525226, gnorm=0.26, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=21421 epoch 009: 889 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=294241, ups=0.68, wpb=430601, bsz=16342.3, num_updates=14500, lr=0.000525226, gnorm=0.26, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=21421 epoch 009: 889 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=294241, ups=0.68, wpb=430601, bsz=16342.3, num_updates=14500, lr=0.000525226, gnorm=0.26, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=21421 epoch 009: 889 / 1707 loss=4.269, nll_loss=2.657, ppl=6.31, wps=294241, ups=0.68, wpb=430601, bsz=16342.3, num_updates=14500, lr=0.000525226, gnorm=0.26, clip=0, loss_scale=4, train_wall=146, gb_free=58.8, wall=21421 epoch 009: 989 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=296344, ups=0.69, wpb=429771, bsz=16595.5, num_updates=14600, lr=0.000523424, gnorm=0.251, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=21566 epoch 009: 989 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=296344, ups=0.69, wpb=429771, bsz=16595.5, num_updates=14600, lr=0.000523424, gnorm=0.251, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=21566 epoch 009: 989 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=296344, ups=0.69, wpb=429771, bsz=16595.5, num_updates=14600, lr=0.000523424, gnorm=0.251, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=21566 epoch 009: 989 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=296344, ups=0.69, wpb=429771, bsz=16595.5, num_updates=14600, lr=0.000523424, gnorm=0.251, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=21566 epoch 009: 989 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=296344, ups=0.69, wpb=429771, bsz=16595.5, num_updates=14600, lr=0.000523424, gnorm=0.251, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=21566 epoch 009: 989 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=296344, ups=0.69, wpb=429771, bsz=16595.5, num_updates=14600, lr=0.000523424, gnorm=0.251, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=21566 epoch 009: 989 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=296344, ups=0.69, wpb=429771, bsz=16595.5, num_updates=14600, lr=0.000523424, gnorm=0.251, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=21566 epoch 009: 989 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=296344, ups=0.69, wpb=429771, bsz=16595.5, num_updates=14600, lr=0.000523424, gnorm=0.251, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=21566 epoch 009: 989 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=296344, ups=0.69, wpb=429771, bsz=16595.5, num_updates=14600, lr=0.000523424, gnorm=0.251, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=21566 epoch 009: 1089 / 1707 loss=4.261, nll_loss=2.648, ppl=6.27, wps=294745, ups=0.69, wpb=427695, bsz=16305.8, num_updates=14700, lr=0.000521641, gnorm=0.269, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=21711 epoch 009: 1089 / 1707 loss=4.261, nll_loss=2.648, ppl=6.27, wps=294745, ups=0.69, wpb=427695, bsz=16305.8, num_updates=14700, lr=0.000521641, gnorm=0.269, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=21711 epoch 009: 1089 / 1707 loss=4.261, nll_loss=2.648, ppl=6.27, wps=294745, ups=0.69, wpb=427695, bsz=16305.8, num_updates=14700, lr=0.000521641, gnorm=0.269, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=21711 epoch 009: 1089 / 1707 loss=4.261, nll_loss=2.648, ppl=6.27, wps=294745, ups=0.69, wpb=427695, bsz=16305.8, num_updates=14700, lr=0.000521641, gnorm=0.269, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=21711 epoch 009: 1089 / 1707 loss=4.261, nll_loss=2.648, ppl=6.27, wps=294745, ups=0.69, wpb=427695, bsz=16305.8, num_updates=14700, lr=0.000521641, gnorm=0.269, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=21711 epoch 009: 1089 / 1707 loss=4.261, nll_loss=2.648, ppl=6.27, wps=294745, ups=0.69, wpb=427695, bsz=16305.8, num_updates=14700, lr=0.000521641, gnorm=0.269, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=21711 epoch 009: 1089 / 1707 loss=4.261, nll_loss=2.648, ppl=6.27, wps=294745, ups=0.69, wpb=427695, bsz=16305.8, num_updates=14700, lr=0.000521641, gnorm=0.269, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=21711 epoch 009: 1089 / 1707 loss=4.261, nll_loss=2.648, ppl=6.27, wps=294745, ups=0.69, wpb=427695, bsz=16305.8, num_updates=14700, lr=0.000521641, gnorm=0.269, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=21711 epoch 009: 1089 / 1707 loss=4.261, nll_loss=2.648, ppl=6.27, wps=294745, ups=0.69, wpb=427695, bsz=16305.8, num_updates=14700, lr=0.000521641, gnorm=0.269, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=21711 epoch 009: 1189 / 1707 loss=4.263, nll_loss=2.65, ppl=6.28, wps=296169, ups=0.69, wpb=428402, bsz=16327.8, num_updates=14800, lr=0.000519875, gnorm=0.26, clip=0, loss_scale=8, train_wall=144, gb_free=59, wall=21856 epoch 009: 1189 / 1707 loss=4.263, nll_loss=2.65, ppl=6.28, wps=296169, ups=0.69, wpb=428402, bsz=16327.8, num_updates=14800, lr=0.000519875, gnorm=0.26, clip=0, loss_scale=8, train_wall=144, gb_free=59, wall=21856 epoch 009: 1189 / 1707 loss=4.263, nll_loss=2.65, ppl=6.28, wps=296169, ups=0.69, wpb=428402, bsz=16327.8, num_updates=14800, lr=0.000519875, gnorm=0.26, clip=0, loss_scale=8, train_wall=144, gb_free=59, wall=21856 epoch 009: 1189 / 1707 loss=4.263, nll_loss=2.65, ppl=6.28, wps=296169, ups=0.69, wpb=428402, bsz=16327.8, num_updates=14800, lr=0.000519875, gnorm=0.26, clip=0, loss_scale=8, train_wall=144, gb_free=59, wall=21856 epoch 009: 1189 / 1707 loss=4.263, nll_loss=2.65, ppl=6.28, wps=296169, ups=0.69, wpb=428402, bsz=16327.8, num_updates=14800, lr=0.000519875, gnorm=0.26, clip=0, loss_scale=8, train_wall=144, gb_free=59, wall=21856 epoch 009: 1189 / 1707 loss=4.263, nll_loss=2.65, ppl=6.28, wps=296169, ups=0.69, wpb=428402, bsz=16327.8, num_updates=14800, lr=0.000519875, gnorm=0.26, clip=0, loss_scale=8, train_wall=144, gb_free=59, wall=21856 epoch 009: 1189 / 1707 loss=4.263, nll_loss=2.65, ppl=6.28, wps=296169, ups=0.69, wpb=428402, bsz=16327.8, num_updates=14800, lr=0.000519875, gnorm=0.26, clip=0, loss_scale=8, train_wall=144, gb_free=59, wall=21856 epoch 009: 1189 / 1707 loss=4.263, nll_loss=2.65, ppl=6.28, wps=296169, ups=0.69, wpb=428402, bsz=16327.8, num_updates=14800, lr=0.000519875, gnorm=0.26, clip=0, loss_scale=8, train_wall=144, gb_free=59, wall=21856 epoch 009: 1189 / 1707 loss=4.263, nll_loss=2.65, ppl=6.28, wps=296169, ups=0.69, wpb=428402, bsz=16327.8, num_updates=14800, lr=0.000519875, gnorm=0.26, clip=0, loss_scale=8, train_wall=144, gb_free=59, wall=21856 epoch 009: 1290 / 1707 loss=4.262, nll_loss=2.65, ppl=6.28, wps=292885, ups=0.68, wpb=429064, bsz=16413.4, num_updates=14900, lr=0.000518128, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=22002 epoch 009: 1290 / 1707 loss=4.262, nll_loss=2.65, ppl=6.28, wps=292885, ups=0.68, wpb=429064, bsz=16413.4, num_updates=14900, lr=0.000518128, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=22002 epoch 009: 1290 / 1707 loss=4.262, nll_loss=2.65, ppl=6.28, wps=292885, ups=0.68, wpb=429064, bsz=16413.4, num_updates=14900, lr=0.000518128, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=22002 epoch 009: 1290 / 1707 loss=4.262, nll_loss=2.65, ppl=6.28, wps=292885, ups=0.68, wpb=429064, bsz=16413.4, num_updates=14900, lr=0.000518128, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=22002 epoch 009: 1290 / 1707 loss=4.262, nll_loss=2.65, ppl=6.28, wps=292885, ups=0.68, wpb=429064, bsz=16413.4, num_updates=14900, lr=0.000518128, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=22002 epoch 009: 1290 / 1707 loss=4.262, nll_loss=2.65, ppl=6.28, wps=292885, ups=0.68, wpb=429064, bsz=16413.4, num_updates=14900, lr=0.000518128, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=22002 epoch 009: 1290 / 1707 loss=4.262, nll_loss=2.65, ppl=6.28, wps=292885, ups=0.68, wpb=429064, bsz=16413.4, num_updates=14900, lr=0.000518128, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=22002 epoch 009: 1290 / 1707 loss=4.262, nll_loss=2.65, ppl=6.28, wps=292885, ups=0.68, wpb=429064, bsz=16413.4, num_updates=14900, lr=0.000518128, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=22002 epoch 009: 1290 / 1707 loss=4.262, nll_loss=2.65, ppl=6.28, wps=292885, ups=0.68, wpb=429064, bsz=16413.4, num_updates=14900, lr=0.000518128, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=22002 epoch 009: 1390 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=295698, ups=0.69, wpb=429781, bsz=16346.3, num_updates=15000, lr=0.000516398, gnorm=0.254, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=22147 epoch 009: 1390 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=295698, ups=0.69, wpb=429781, bsz=16346.3, num_updates=15000, lr=0.000516398, gnorm=0.254, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=22147 epoch 009: 1390 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=295698, ups=0.69, wpb=429781, bsz=16346.3, num_updates=15000, lr=0.000516398, gnorm=0.254, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=22147 epoch 009: 1390 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=295698, ups=0.69, wpb=429781, bsz=16346.3, num_updates=15000, lr=0.000516398, gnorm=0.254, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=22147 epoch 009: 1390 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=295698, ups=0.69, wpb=429781, bsz=16346.3, num_updates=15000, lr=0.000516398, gnorm=0.254, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=22147 epoch 009: 1390 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=295698, ups=0.69, wpb=429781, bsz=16346.3, num_updates=15000, lr=0.000516398, gnorm=0.254, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=22147 epoch 009: 1390 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=295698, ups=0.69, wpb=429781, bsz=16346.3, num_updates=15000, lr=0.000516398, gnorm=0.254, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=22147 epoch 009: 1390 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=295698, ups=0.69, wpb=429781, bsz=16346.3, num_updates=15000, lr=0.000516398, gnorm=0.254, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=22147 epoch 009: 1390 / 1707 loss=4.265, nll_loss=2.653, ppl=6.29, wps=295698, ups=0.69, wpb=429781, bsz=16346.3, num_updates=15000, lr=0.000516398, gnorm=0.254, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=22147 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 4.377 | nll_loss 2.759 | ppl 6.77 | wps 76566.7 | wpb 21331 | bsz 1016 | num_updates 15000 | best_loss 4.377 epoch 009 | valid on 'valid' subset | loss 4.377 | nll_loss 2.759 | ppl 6.77 | wps 76566.7 | wpb 21331 | bsz 1016 | num_updates 15000 | best_loss 4.377 epoch 009 | valid on 'valid' subset | loss 4.377 | nll_loss 2.759 | ppl 6.77 | wps 76566.7 | wpb 21331 | bsz 1016 | num_updates 15000 | best_loss 4.377 epoch 009 | valid on 'valid' subset | loss 4.377 | nll_loss 2.759 | ppl 6.77 | wps 76566.7 | wpb 21331 | bsz 1016 | num_updates 15000 | best_loss 4.377 epoch 009 | valid on 'valid' subset | loss 4.377 | nll_loss 2.759 | ppl 6.77 | wps 76566.7 | wpb 21331 | bsz 1016 | num_updates 15000 | best_loss 4.377 epoch 009 | valid on 'valid' subset | loss 4.377 | nll_loss 2.759 | ppl 6.77 | wps 76566.7 | wpb 21331 | bsz 1016 | num_updates 15000 | best_loss 4.377 epoch 009 | valid on 'valid' subset | loss 4.377 | nll_loss 2.759 | ppl 6.77 | wps 76566.7 | wpb 21331 | bsz 1016 | num_updates 15000 | best_loss 4.377 epoch 009 | valid on 'valid' subset | loss 4.377 | nll_loss 2.759 | ppl 6.77 | wps 76566.7 | wpb 21331 | bsz 1016 | num_updates 15000 | best_loss 4.377 epoch 009 | valid on 'valid' subset | loss 4.377 | nll_loss 2.759 | ppl 6.77 | wps 76566.7 | wpb 21331 | bsz 1016 | num_updates 15000 | best_loss 4.377 epoch 009: 1490 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=263453, ups=0.61, wpb=429624, bsz=16130, num_updates=15100, lr=0.000514685, gnorm=0.246, clip=0, loss_scale=8, train_wall=143, gb_free=59.5, wall=22311 epoch 009: 1490 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=263453, ups=0.61, wpb=429624, bsz=16130, num_updates=15100, lr=0.000514685, gnorm=0.246, clip=0, loss_scale=8, train_wall=143, gb_free=59.5, wall=22311 epoch 009: 1490 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=263453, ups=0.61, wpb=429624, bsz=16130, num_updates=15100, lr=0.000514685, gnorm=0.246, clip=0, loss_scale=8, train_wall=143, gb_free=59.5, wall=22311 epoch 009: 1490 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=263453, ups=0.61, wpb=429624, bsz=16130, num_updates=15100, lr=0.000514685, gnorm=0.246, clip=0, loss_scale=8, train_wall=143, gb_free=59.5, wall=22311 epoch 009: 1490 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=263453, ups=0.61, wpb=429624, bsz=16130, num_updates=15100, lr=0.000514685, gnorm=0.246, clip=0, loss_scale=8, train_wall=143, gb_free=59.5, wall=22311 epoch 009: 1490 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=263453, ups=0.61, wpb=429624, bsz=16130, num_updates=15100, lr=0.000514685, gnorm=0.246, clip=0, loss_scale=8, train_wall=143, gb_free=59.5, wall=22311 epoch 009: 1490 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=263453, ups=0.61, wpb=429624, bsz=16130, num_updates=15100, lr=0.000514685, gnorm=0.246, clip=0, loss_scale=8, train_wall=143, gb_free=59.5, wall=22311 epoch 009: 1490 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=263453, ups=0.61, wpb=429624, bsz=16130, num_updates=15100, lr=0.000514685, gnorm=0.246, clip=0, loss_scale=8, train_wall=143, gb_free=59.5, wall=22311 epoch 009: 1490 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=263453, ups=0.61, wpb=429624, bsz=16130, num_updates=15100, lr=0.000514685, gnorm=0.246, clip=0, loss_scale=8, train_wall=143, gb_free=59.5, wall=22311 epoch 009: 1591 / 1707 loss=4.267, nll_loss=2.656, ppl=6.3, wps=295017, ups=0.68, wpb=430996, bsz=16633.5, num_updates=15200, lr=0.000512989, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=22457 epoch 009: 1591 / 1707 loss=4.267, nll_loss=2.656, ppl=6.3, wps=295017, ups=0.68, wpb=430996, bsz=16633.5, num_updates=15200, lr=0.000512989, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=22457 epoch 009: 1591 / 1707 loss=4.267, nll_loss=2.656, ppl=6.3, wps=295017, ups=0.68, wpb=430996, bsz=16633.5, num_updates=15200, lr=0.000512989, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=22457 epoch 009: 1591 / 1707 loss=4.267, nll_loss=2.656, ppl=6.3, wps=295017, ups=0.68, wpb=430996, bsz=16633.5, num_updates=15200, lr=0.000512989, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=22457 epoch 009: 1591 / 1707 loss=4.267, nll_loss=2.656, ppl=6.3, wps=295017, ups=0.68, wpb=430996, bsz=16633.5, num_updates=15200, lr=0.000512989, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=22457 epoch 009: 1591 / 1707 loss=4.267, nll_loss=2.656, ppl=6.3, wps=295017, ups=0.68, wpb=430996, bsz=16633.5, num_updates=15200, lr=0.000512989, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=22457 epoch 009: 1591 / 1707 loss=4.267, nll_loss=2.656, ppl=6.3, wps=295017, ups=0.68, wpb=430996, bsz=16633.5, num_updates=15200, lr=0.000512989, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=22457 epoch 009: 1591 / 1707 loss=4.267, nll_loss=2.656, ppl=6.3, wps=295017, ups=0.68, wpb=430996, bsz=16633.5, num_updates=15200, lr=0.000512989, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=22457 epoch 009: 1591 / 1707 loss=4.267, nll_loss=2.656, ppl=6.3, wps=295017, ups=0.68, wpb=430996, bsz=16633.5, num_updates=15200, lr=0.000512989, gnorm=0.264, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=22457 epoch 009: 1692 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=293184, ups=0.68, wpb=428033, bsz=16411.3, num_updates=15300, lr=0.00051131, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=22603 epoch 009: 1692 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=293184, ups=0.68, wpb=428033, bsz=16411.3, num_updates=15300, lr=0.00051131, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=22603 epoch 009: 1692 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=293184, ups=0.68, wpb=428033, bsz=16411.3, num_updates=15300, lr=0.00051131, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=22603 epoch 009: 1692 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=293184, ups=0.68, wpb=428033, bsz=16411.3, num_updates=15300, lr=0.00051131, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=22603 epoch 009: 1692 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=293184, ups=0.68, wpb=428033, bsz=16411.3, num_updates=15300, lr=0.00051131, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=22603 epoch 009: 1692 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=293184, ups=0.68, wpb=428033, bsz=16411.3, num_updates=15300, lr=0.00051131, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=22603 epoch 009: 1692 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=293184, ups=0.68, wpb=428033, bsz=16411.3, num_updates=15300, lr=0.00051131, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=22603 epoch 009: 1692 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=293184, ups=0.68, wpb=428033, bsz=16411.3, num_updates=15300, lr=0.00051131, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=22603 epoch 009: 1692 / 1707 loss=4.261, nll_loss=2.649, ppl=6.27, wps=293184, ups=0.68, wpb=428033, bsz=16411.3, num_updates=15300, lr=0.00051131, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=22603 end of epoch 9 (average epoch stats below) epoch 009 | loss 4.262 | nll_loss 2.649 | ppl 6.27 | wps 290566 | ups 0.68 | wpb 428958 | bsz 16331.5 | num_updates 15315 | lr 0.00051106 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2457 | gb_free 58.8 | wall 22623 epoch 009 | loss 4.262 | nll_loss 2.649 | ppl 6.27 | wps 290566 | ups 0.68 | wpb 428958 | bsz 16331.5 | num_updates 15315 | lr 0.00051106 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2457 | gb_free 58.8 | wall 22623 epoch 009 | loss 4.262 | nll_loss 2.649 | ppl 6.27 | wps 290566 | ups 0.68 | wpb 428958 | bsz 16331.5 | num_updates 15315 | lr 0.00051106 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2457 | gb_free 58.8 | wall 22623 epoch 009 | loss 4.262 | nll_loss 2.649 | ppl 6.27 | wps 290566 | ups 0.68 | wpb 428958 | bsz 16331.5 | num_updates 15315 | lr 0.00051106 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2457 | gb_free 58.8 | wall 22623 epoch 009 | loss 4.262 | nll_loss 2.649 | ppl 6.27 | wps 290566 | ups 0.68 | wpb 428958 | bsz 16331.5 | num_updates 15315 | lr 0.00051106 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2457 | gb_free 58.8 | wall 22623 epoch 009 | loss 4.262 | nll_loss 2.649 | ppl 6.27 | wps 290566 | ups 0.68 | wpb 428958 | bsz 16331.5 | num_updates 15315 | lr 0.00051106 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2457 | gb_free 58.8 | wall 22623 epoch 009 | loss 4.262 | nll_loss 2.649 | ppl 6.27 | wps 290566 | ups 0.68 | wpb 428958 | bsz 16331.5 | num_updates 15315 | lr 0.00051106 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2457 | gb_free 58.8 | wall 22623 epoch 009 | loss 4.262 | nll_loss 2.649 | ppl 6.27 | wps 290566 | ups 0.68 | wpb 428958 | bsz 16331.5 | num_updates 15315 | lr 0.00051106 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2457 | gb_free 58.8 | wall 22623 epoch 009 | loss 4.262 | nll_loss 2.649 | ppl 6.27 | wps 290566 | ups 0.68 | wpb 428958 | bsz 16331.5 | num_updates 15315 | lr 0.00051106 | gnorm 0.256 | clip 0 | loss_scale 2 | train_wall 2457 | gb_free 58.8 | wall 22623 Start iterating over samples epoch 010: 85 / 1707 loss=4.224, nll_loss=2.605, ppl=6.08, wps=294565, ups=0.69, wpb=424344, bsz=16060.6, num_updates=15400, lr=0.000509647, gnorm=0.268, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=22747 epoch 010: 85 / 1707 loss=4.224, nll_loss=2.605, ppl=6.08, wps=294565, ups=0.69, wpb=424344, bsz=16060.6, num_updates=15400, lr=0.000509647, gnorm=0.268, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=22747 epoch 010: 85 / 1707 loss=4.224, nll_loss=2.605, ppl=6.08, wps=294565, ups=0.69, wpb=424344, bsz=16060.6, num_updates=15400, lr=0.000509647, gnorm=0.268, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=22747 epoch 010: 85 / 1707 loss=4.224, nll_loss=2.605, ppl=6.08, wps=294565, ups=0.69, wpb=424344, bsz=16060.6, num_updates=15400, lr=0.000509647, gnorm=0.268, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=22747 epoch 010: 85 / 1707 loss=4.224, nll_loss=2.605, ppl=6.08, wps=294565, ups=0.69, wpb=424344, bsz=16060.6, num_updates=15400, lr=0.000509647, gnorm=0.268, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=22747 epoch 010: 85 / 1707 loss=4.224, nll_loss=2.605, ppl=6.08, wps=294565, ups=0.69, wpb=424344, bsz=16060.6, num_updates=15400, lr=0.000509647, gnorm=0.268, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=22747 epoch 010: 85 / 1707 loss=4.224, nll_loss=2.605, ppl=6.08, wps=294565, ups=0.69, wpb=424344, bsz=16060.6, num_updates=15400, lr=0.000509647, gnorm=0.268, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=22747 epoch 010: 85 / 1707 loss=4.224, nll_loss=2.605, ppl=6.08, wps=294565, ups=0.69, wpb=424344, bsz=16060.6, num_updates=15400, lr=0.000509647, gnorm=0.268, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=22747 epoch 010: 85 / 1707 loss=4.224, nll_loss=2.605, ppl=6.08, wps=294565, ups=0.69, wpb=424344, bsz=16060.6, num_updates=15400, lr=0.000509647, gnorm=0.268, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=22747 epoch 010: 85 / 1707 loss=4.224, nll_loss=2.605, ppl=6.08, wps=294565, ups=0.69, wpb=424344, bsz=16060.6, num_updates=15400, lr=0.000509647, gnorm=0.268, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=22747 epoch 010: 185 / 1707 loss=4.226, nll_loss=2.608, ppl=6.09, wps=295740, ups=0.69, wpb=428231, bsz=16124.6, num_updates=15500, lr=0.000508001, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.7, wall=22891 epoch 010: 185 / 1707 loss=4.226, nll_loss=2.608, ppl=6.09, wps=295740, ups=0.69, wpb=428231, bsz=16124.6, num_updates=15500, lr=0.000508001, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.7, wall=22891 epoch 010: 185 / 1707 loss=4.226, nll_loss=2.608, ppl=6.09, wps=295740, ups=0.69, wpb=428231, bsz=16124.6, num_updates=15500, lr=0.000508001, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.7, wall=22891 epoch 010: 185 / 1707 loss=4.226, nll_loss=2.608, ppl=6.09, wps=295740, ups=0.69, wpb=428231, bsz=16124.6, num_updates=15500, lr=0.000508001, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.7, wall=22891 epoch 010: 185 / 1707 loss=4.226, nll_loss=2.608, ppl=6.09, wps=295740, ups=0.69, wpb=428231, bsz=16124.6, num_updates=15500, lr=0.000508001, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.7, wall=22891 epoch 010: 185 / 1707 loss=4.226, nll_loss=2.608, ppl=6.09, wps=295740, ups=0.69, wpb=428231, bsz=16124.6, num_updates=15500, lr=0.000508001, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.7, wall=22891 epoch 010: 185 / 1707 loss=4.226, nll_loss=2.608, ppl=6.09, wps=295740, ups=0.69, wpb=428231, bsz=16124.6, num_updates=15500, lr=0.000508001, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.7, wall=22891 epoch 010: 185 / 1707 loss=4.226, nll_loss=2.608, ppl=6.09, wps=295740, ups=0.69, wpb=428231, bsz=16124.6, num_updates=15500, lr=0.000508001, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.7, wall=22891 epoch 010: 185 / 1707 loss=4.226, nll_loss=2.608, ppl=6.09, wps=295740, ups=0.69, wpb=428231, bsz=16124.6, num_updates=15500, lr=0.000508001, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.7, wall=22891 epoch 010: 185 / 1707 loss=4.226, nll_loss=2.608, ppl=6.09, wps=295740, ups=0.69, wpb=428231, bsz=16124.6, num_updates=15500, lr=0.000508001, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.7, wall=22891 epoch 010: 285 / 1707 loss=4.224, nll_loss=2.606, ppl=6.09, wps=295944, ups=0.69, wpb=429438, bsz=16262.8, num_updates=15600, lr=0.00050637, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=23037 epoch 010: 285 / 1707 loss=4.224, nll_loss=2.606, ppl=6.09, wps=295944, ups=0.69, wpb=429438, bsz=16262.8, num_updates=15600, lr=0.00050637, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=23037 epoch 010: 285 / 1707 loss=4.224, nll_loss=2.606, ppl=6.09, wps=295944, ups=0.69, wpb=429438, bsz=16262.8, num_updates=15600, lr=0.00050637, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=23037 epoch 010: 285 / 1707 loss=4.224, nll_loss=2.606, ppl=6.09, wps=295944, ups=0.69, wpb=429438, bsz=16262.8, num_updates=15600, lr=0.00050637, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=23037 epoch 010: 285 / 1707 loss=4.224, nll_loss=2.606, ppl=6.09, wps=295944, ups=0.69, wpb=429438, bsz=16262.8, num_updates=15600, lr=0.00050637, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=23037 epoch 010: 285 / 1707 loss=4.224, nll_loss=2.606, ppl=6.09, wps=295944, ups=0.69, wpb=429438, bsz=16262.8, num_updates=15600, lr=0.00050637, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=23037 epoch 010: 285 / 1707 loss=4.224, nll_loss=2.606, ppl=6.09, wps=295944, ups=0.69, wpb=429438, bsz=16262.8, num_updates=15600, lr=0.00050637, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=23037 epoch 010: 285 / 1707 loss=4.224, nll_loss=2.606, ppl=6.09, wps=295944, ups=0.69, wpb=429438, bsz=16262.8, num_updates=15600, lr=0.00050637, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=23037 epoch 010: 285 / 1707 loss=4.224, nll_loss=2.606, ppl=6.09, wps=295944, ups=0.69, wpb=429438, bsz=16262.8, num_updates=15600, lr=0.00050637, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=23037 epoch 010: 285 / 1707 loss=4.224, nll_loss=2.606, ppl=6.09, wps=295944, ups=0.69, wpb=429438, bsz=16262.8, num_updates=15600, lr=0.00050637, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=23037 epoch 010: 385 / 1707 loss=4.233, nll_loss=2.616, ppl=6.13, wps=296730, ups=0.69, wpb=430359, bsz=16531.2, num_updates=15700, lr=0.000504754, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23182 epoch 010: 385 / 1707 loss=4.233, nll_loss=2.616, ppl=6.13, wps=296730, ups=0.69, wpb=430359, bsz=16531.2, num_updates=15700, lr=0.000504754, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23182 epoch 010: 385 / 1707 loss=4.233, nll_loss=2.616, ppl=6.13, wps=296730, ups=0.69, wpb=430359, bsz=16531.2, num_updates=15700, lr=0.000504754, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23182 epoch 010: 385 / 1707 loss=4.233, nll_loss=2.616, ppl=6.13, wps=296730, ups=0.69, wpb=430359, bsz=16531.2, num_updates=15700, lr=0.000504754, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23182 epoch 010: 385 / 1707 loss=4.233, nll_loss=2.616, ppl=6.13, wps=296730, ups=0.69, wpb=430359, bsz=16531.2, num_updates=15700, lr=0.000504754, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23182 epoch 010: 385 / 1707 loss=4.233, nll_loss=2.616, ppl=6.13, wps=296730, ups=0.69, wpb=430359, bsz=16531.2, num_updates=15700, lr=0.000504754, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23182 epoch 010: 385 / 1707 loss=4.233, nll_loss=2.616, ppl=6.13, wps=296730, ups=0.69, wpb=430359, bsz=16531.2, num_updates=15700, lr=0.000504754, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23182 epoch 010: 385 / 1707 loss=4.233, nll_loss=2.616, ppl=6.13, wps=296730, ups=0.69, wpb=430359, bsz=16531.2, num_updates=15700, lr=0.000504754, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23182 epoch 010: 385 / 1707 loss=4.233, nll_loss=2.616, ppl=6.13, wps=296730, ups=0.69, wpb=430359, bsz=16531.2, num_updates=15700, lr=0.000504754, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23182 epoch 010: 385 / 1707 loss=4.233, nll_loss=2.616, ppl=6.13, wps=296730, ups=0.69, wpb=430359, bsz=16531.2, num_updates=15700, lr=0.000504754, gnorm=0.248, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23182 epoch 010: 486 / 1707 loss=4.234, nll_loss=2.617, ppl=6.14, wps=293384, ups=0.68, wpb=429395, bsz=16242.1, num_updates=15800, lr=0.000503155, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=23328 epoch 010: 486 / 1707 loss=4.234, nll_loss=2.617, ppl=6.14, wps=293384, ups=0.68, wpb=429395, bsz=16242.1, num_updates=15800, lr=0.000503155, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=23328 epoch 010: 486 / 1707 loss=4.234, nll_loss=2.617, ppl=6.14, wps=293384, ups=0.68, wpb=429395, bsz=16242.1, num_updates=15800, lr=0.000503155, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=23328 epoch 010: 486 / 1707 loss=4.234, nll_loss=2.617, ppl=6.14, wps=293384, ups=0.68, wpb=429395, bsz=16242.1, num_updates=15800, lr=0.000503155, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=23328 epoch 010: 486 / 1707 loss=4.234, nll_loss=2.617, ppl=6.14, wps=293384, ups=0.68, wpb=429395, bsz=16242.1, num_updates=15800, lr=0.000503155, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=23328 epoch 010: 486 / 1707 loss=4.234, nll_loss=2.617, ppl=6.14, wps=293384, ups=0.68, wpb=429395, bsz=16242.1, num_updates=15800, lr=0.000503155, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=23328 epoch 010: 486 / 1707 loss=4.234, nll_loss=2.617, ppl=6.14, wps=293384, ups=0.68, wpb=429395, bsz=16242.1, num_updates=15800, lr=0.000503155, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=23328 epoch 010: 486 / 1707 loss=4.234, nll_loss=2.617, ppl=6.14, wps=293384, ups=0.68, wpb=429395, bsz=16242.1, num_updates=15800, lr=0.000503155, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=23328 epoch 010: 486 / 1707 loss=4.234, nll_loss=2.617, ppl=6.14, wps=293384, ups=0.68, wpb=429395, bsz=16242.1, num_updates=15800, lr=0.000503155, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=23328 epoch 010: 486 / 1707 loss=4.234, nll_loss=2.617, ppl=6.14, wps=293384, ups=0.68, wpb=429395, bsz=16242.1, num_updates=15800, lr=0.000503155, gnorm=0.254, clip=0, loss_scale=4, train_wall=146, gb_free=59.5, wall=23328 epoch 010: 587 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=292377, ups=0.68, wpb=429336, bsz=16660.2, num_updates=15900, lr=0.00050157, gnorm=0.263, clip=0, loss_scale=2, train_wall=146, gb_free=58.8, wall=23475 epoch 010: 587 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=292377, ups=0.68, wpb=429336, bsz=16660.2, num_updates=15900, lr=0.00050157, gnorm=0.263, clip=0, loss_scale=2, train_wall=146, gb_free=58.8, wall=23475 epoch 010: 587 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=292377, ups=0.68, wpb=429336, bsz=16660.2, num_updates=15900, lr=0.00050157, gnorm=0.263, clip=0, loss_scale=2, train_wall=146, gb_free=58.8, wall=23475 epoch 010: 587 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=292377, ups=0.68, wpb=429336, bsz=16660.2, num_updates=15900, lr=0.00050157, gnorm=0.263, clip=0, loss_scale=2, train_wall=146, gb_free=58.8, wall=23475 epoch 010: 587 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=292377, ups=0.68, wpb=429336, bsz=16660.2, num_updates=15900, lr=0.00050157, gnorm=0.263, clip=0, loss_scale=2, train_wall=146, gb_free=58.8, wall=23475 epoch 010: 587 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=292377, ups=0.68, wpb=429336, bsz=16660.2, num_updates=15900, lr=0.00050157, gnorm=0.263, clip=0, loss_scale=2, train_wall=146, gb_free=58.8, wall=23475 epoch 010: 587 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=292377, ups=0.68, wpb=429336, bsz=16660.2, num_updates=15900, lr=0.00050157, gnorm=0.263, clip=0, loss_scale=2, train_wall=146, gb_free=58.8, wall=23475 epoch 010: 587 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=292377, ups=0.68, wpb=429336, bsz=16660.2, num_updates=15900, lr=0.00050157, gnorm=0.263, clip=0, loss_scale=2, train_wall=146, gb_free=58.8, wall=23475 epoch 010: 587 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=292377, ups=0.68, wpb=429336, bsz=16660.2, num_updates=15900, lr=0.00050157, gnorm=0.263, clip=0, loss_scale=2, train_wall=146, gb_free=58.8, wall=23475 epoch 010: 587 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=292377, ups=0.68, wpb=429336, bsz=16660.2, num_updates=15900, lr=0.00050157, gnorm=0.263, clip=0, loss_scale=2, train_wall=146, gb_free=58.8, wall=23475 epoch 010: 687 / 1707 loss=4.24, nll_loss=2.625, ppl=6.17, wps=296580, ups=0.69, wpb=429816, bsz=16494.4, num_updates=16000, lr=0.0005, gnorm=0.255, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=23620 epoch 010: 687 / 1707 loss=4.24, nll_loss=2.625, ppl=6.17, wps=296580, ups=0.69, wpb=429816, bsz=16494.4, num_updates=16000, lr=0.0005, gnorm=0.255, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=23620 epoch 010: 687 / 1707 loss=4.24, nll_loss=2.625, ppl=6.17, wps=296580, ups=0.69, wpb=429816, bsz=16494.4, num_updates=16000, lr=0.0005, gnorm=0.255, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=23620 epoch 010: 687 / 1707 loss=4.24, nll_loss=2.625, ppl=6.17, wps=296580, ups=0.69, wpb=429816, bsz=16494.4, num_updates=16000, lr=0.0005, gnorm=0.255, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=23620 epoch 010: 687 / 1707 loss=4.24, nll_loss=2.625, ppl=6.17, wps=296580, ups=0.69, wpb=429816, bsz=16494.4, num_updates=16000, lr=0.0005, gnorm=0.255, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=23620 epoch 010: 687 / 1707 loss=4.24, nll_loss=2.625, ppl=6.17, wps=296580, ups=0.69, wpb=429816, bsz=16494.4, num_updates=16000, lr=0.0005, gnorm=0.255, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=23620 epoch 010: 687 / 1707 loss=4.24, nll_loss=2.625, ppl=6.17, wps=296580, ups=0.69, wpb=429816, bsz=16494.4, num_updates=16000, lr=0.0005, gnorm=0.255, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=23620 epoch 010: 687 / 1707 loss=4.24, nll_loss=2.625, ppl=6.17, wps=296580, ups=0.69, wpb=429816, bsz=16494.4, num_updates=16000, lr=0.0005, gnorm=0.255, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=23620 epoch 010: 687 / 1707 loss=4.24, nll_loss=2.625, ppl=6.17, wps=296580, ups=0.69, wpb=429816, bsz=16494.4, num_updates=16000, lr=0.0005, gnorm=0.255, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=23620 epoch 010: 687 / 1707 loss=4.24, nll_loss=2.625, ppl=6.17, wps=296580, ups=0.69, wpb=429816, bsz=16494.4, num_updates=16000, lr=0.0005, gnorm=0.255, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=23620 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 4.356 | nll_loss 2.737 | ppl 6.67 | wps 76176.1 | wpb 21331 | bsz 1016 | num_updates 16000 | best_loss 4.356 epoch 010 | valid on 'valid' subset | loss 4.356 | nll_loss 2.737 | ppl 6.67 | wps 76176.1 | wpb 21331 | bsz 1016 | num_updates 16000 | best_loss 4.356 epoch 010 | valid on 'valid' subset | loss 4.356 | nll_loss 2.737 | ppl 6.67 | wps 76176.1 | wpb 21331 | bsz 1016 | num_updates 16000 | best_loss 4.356 epoch 010 | valid on 'valid' subset | loss 4.356 | nll_loss 2.737 | ppl 6.67 | wps 76176.1 | wpb 21331 | bsz 1016 | num_updates 16000 | best_loss 4.356 epoch 010 | valid on 'valid' subset | loss 4.356 | nll_loss 2.737 | ppl 6.67 | wps 76176.1 | wpb 21331 | bsz 1016 | num_updates 16000 | best_loss 4.356 epoch 010 | valid on 'valid' subset | loss 4.356 | nll_loss 2.737 | ppl 6.67 | wps 76176.1 | wpb 21331 | bsz 1016 | num_updates 16000 | best_loss 4.356 epoch 010 | valid on 'valid' subset | loss 4.356 | nll_loss 2.737 | ppl 6.67 | wps 76176.1 | wpb 21331 | bsz 1016 | num_updates 16000 | best_loss 4.356 epoch 010 | valid on 'valid' subset | loss 4.356 | nll_loss 2.737 | ppl 6.67 | wps 76176.1 | wpb 21331 | bsz 1016 | num_updates 16000 | best_loss 4.356 epoch 010 | valid on 'valid' subset | loss 4.356 | nll_loss 2.737 | ppl 6.67 | wps 76176.1 | wpb 21331 | bsz 1016 | num_updates 16000 | best_loss 4.356 epoch 010 | valid on 'valid' subset | loss 4.356 | nll_loss 2.737 | ppl 6.67 | wps 76176.1 | wpb 21331 | bsz 1016 | num_updates 16000 | best_loss 4.356 epoch 010: 787 / 1707 loss=4.23, nll_loss=2.613, ppl=6.12, wps=265560, ups=0.62, wpb=429475, bsz=16315.2, num_updates=16100, lr=0.000498445, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=23781 epoch 010: 787 / 1707 loss=4.23, nll_loss=2.613, ppl=6.12, wps=265560, ups=0.62, wpb=429475, bsz=16315.2, num_updates=16100, lr=0.000498445, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=23781 epoch 010: 787 / 1707 loss=4.23, nll_loss=2.613, ppl=6.12, wps=265560, ups=0.62, wpb=429475, bsz=16315.2, num_updates=16100, lr=0.000498445, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=23781 epoch 010: 787 / 1707 loss=4.23, nll_loss=2.613, ppl=6.12, wps=265560, ups=0.62, wpb=429475, bsz=16315.2, num_updates=16100, lr=0.000498445, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=23781 epoch 010: 787 / 1707 loss=4.23, nll_loss=2.613, ppl=6.12, wps=265560, ups=0.62, wpb=429475, bsz=16315.2, num_updates=16100, lr=0.000498445, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=23781 epoch 010: 787 / 1707 loss=4.23, nll_loss=2.613, ppl=6.12, wps=265560, ups=0.62, wpb=429475, bsz=16315.2, num_updates=16100, lr=0.000498445, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=23781 epoch 010: 787 / 1707 loss=4.23, nll_loss=2.613, ppl=6.12, wps=265560, ups=0.62, wpb=429475, bsz=16315.2, num_updates=16100, lr=0.000498445, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=23781 epoch 010: 787 / 1707 loss=4.23, nll_loss=2.613, ppl=6.12, wps=265560, ups=0.62, wpb=429475, bsz=16315.2, num_updates=16100, lr=0.000498445, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=23781 epoch 010: 787 / 1707 loss=4.23, nll_loss=2.613, ppl=6.12, wps=265560, ups=0.62, wpb=429475, bsz=16315.2, num_updates=16100, lr=0.000498445, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=23781 epoch 010: 787 / 1707 loss=4.23, nll_loss=2.613, ppl=6.12, wps=265560, ups=0.62, wpb=429475, bsz=16315.2, num_updates=16100, lr=0.000498445, gnorm=0.26, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=23781 epoch 010: 887 / 1707 loss=4.237, nll_loss=2.621, ppl=6.15, wps=296842, ups=0.69, wpb=429549, bsz=16483.3, num_updates=16200, lr=0.000496904, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23926 epoch 010: 887 / 1707 loss=4.237, nll_loss=2.621, ppl=6.15, wps=296842, ups=0.69, wpb=429549, bsz=16483.3, num_updates=16200, lr=0.000496904, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23926 epoch 010: 887 / 1707 loss=4.237, nll_loss=2.621, ppl=6.15, wps=296842, ups=0.69, wpb=429549, bsz=16483.3, num_updates=16200, lr=0.000496904, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23926 epoch 010: 887 / 1707 loss=4.237, nll_loss=2.621, ppl=6.15, wps=296842, ups=0.69, wpb=429549, bsz=16483.3, num_updates=16200, lr=0.000496904, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23926 epoch 010: 887 / 1707 loss=4.237, nll_loss=2.621, ppl=6.15, wps=296842, ups=0.69, wpb=429549, bsz=16483.3, num_updates=16200, lr=0.000496904, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23926 epoch 010: 887 / 1707 loss=4.237, nll_loss=2.621, ppl=6.15, wps=296842, ups=0.69, wpb=429549, bsz=16483.3, num_updates=16200, lr=0.000496904, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23926 epoch 010: 887 / 1707 loss=4.237, nll_loss=2.621, ppl=6.15, wps=296842, ups=0.69, wpb=429549, bsz=16483.3, num_updates=16200, lr=0.000496904, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23926 epoch 010: 887 / 1707 loss=4.237, nll_loss=2.621, ppl=6.15, wps=296842, ups=0.69, wpb=429549, bsz=16483.3, num_updates=16200, lr=0.000496904, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23926 epoch 010: 887 / 1707 loss=4.237, nll_loss=2.621, ppl=6.15, wps=296842, ups=0.69, wpb=429549, bsz=16483.3, num_updates=16200, lr=0.000496904, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23926 epoch 010: 887 / 1707 loss=4.237, nll_loss=2.621, ppl=6.15, wps=296842, ups=0.69, wpb=429549, bsz=16483.3, num_updates=16200, lr=0.000496904, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=23926 epoch 010: 987 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=297670, ups=0.69, wpb=430090, bsz=16196.2, num_updates=16300, lr=0.000495377, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=24071 epoch 010: 987 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=297670, ups=0.69, wpb=430090, bsz=16196.2, num_updates=16300, lr=0.000495377, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=24071 epoch 010: 987 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=297670, ups=0.69, wpb=430090, bsz=16196.2, num_updates=16300, lr=0.000495377, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=24071 epoch 010: 987 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=297670, ups=0.69, wpb=430090, bsz=16196.2, num_updates=16300, lr=0.000495377, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=24071 epoch 010: 987 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=297670, ups=0.69, wpb=430090, bsz=16196.2, num_updates=16300, lr=0.000495377, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=24071 epoch 010: 987 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=297670, ups=0.69, wpb=430090, bsz=16196.2, num_updates=16300, lr=0.000495377, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=24071 epoch 010: 987 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=297670, ups=0.69, wpb=430090, bsz=16196.2, num_updates=16300, lr=0.000495377, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=24071 epoch 010: 987 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=297670, ups=0.69, wpb=430090, bsz=16196.2, num_updates=16300, lr=0.000495377, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=24071 epoch 010: 987 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=297670, ups=0.69, wpb=430090, bsz=16196.2, num_updates=16300, lr=0.000495377, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=24071 epoch 010: 987 / 1707 loss=4.235, nll_loss=2.619, ppl=6.14, wps=297670, ups=0.69, wpb=430090, bsz=16196.2, num_updates=16300, lr=0.000495377, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=24071 epoch 010: 1088 / 1707 loss=4.228, nll_loss=2.611, ppl=6.11, wps=292849, ups=0.68, wpb=427900, bsz=16375.7, num_updates=16400, lr=0.000493865, gnorm=0.259, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=24217 epoch 010: 1088 / 1707 loss=4.228, nll_loss=2.611, ppl=6.11, wps=292849, ups=0.68, wpb=427900, bsz=16375.7, num_updates=16400, lr=0.000493865, gnorm=0.259, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=24217 epoch 010: 1088 / 1707 loss=4.228, nll_loss=2.611, ppl=6.11, wps=292849, ups=0.68, wpb=427900, bsz=16375.7, num_updates=16400, lr=0.000493865, gnorm=0.259, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=24217 epoch 010: 1088 / 1707 loss=4.228, nll_loss=2.611, ppl=6.11, wps=292849, ups=0.68, wpb=427900, bsz=16375.7, num_updates=16400, lr=0.000493865, gnorm=0.259, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=24217 epoch 010: 1088 / 1707 loss=4.228, nll_loss=2.611, ppl=6.11, wps=292849, ups=0.68, wpb=427900, bsz=16375.7, num_updates=16400, lr=0.000493865, gnorm=0.259, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=24217 epoch 010: 1088 / 1707 loss=4.228, nll_loss=2.611, ppl=6.11, wps=292849, ups=0.68, wpb=427900, bsz=16375.7, num_updates=16400, lr=0.000493865, gnorm=0.259, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=24217 epoch 010: 1088 / 1707 loss=4.228, nll_loss=2.611, ppl=6.11, wps=292849, ups=0.68, wpb=427900, bsz=16375.7, num_updates=16400, lr=0.000493865, gnorm=0.259, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=24217 epoch 010: 1088 / 1707 loss=4.228, nll_loss=2.611, ppl=6.11, wps=292849, ups=0.68, wpb=427900, bsz=16375.7, num_updates=16400, lr=0.000493865, gnorm=0.259, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=24217 epoch 010: 1088 / 1707 loss=4.228, nll_loss=2.611, ppl=6.11, wps=292849, ups=0.68, wpb=427900, bsz=16375.7, num_updates=16400, lr=0.000493865, gnorm=0.259, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=24217 epoch 010: 1088 / 1707 loss=4.228, nll_loss=2.611, ppl=6.11, wps=292849, ups=0.68, wpb=427900, bsz=16375.7, num_updates=16400, lr=0.000493865, gnorm=0.259, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=24217 epoch 010: 1189 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=292520, ups=0.68, wpb=428282, bsz=16112.2, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=24363 epoch 010: 1189 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=292520, ups=0.68, wpb=428282, bsz=16112.2, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=24363 epoch 010: 1189 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=292520, ups=0.68, wpb=428282, bsz=16112.2, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=24363 epoch 010: 1189 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=292520, ups=0.68, wpb=428282, bsz=16112.2, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=24363 epoch 010: 1189 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=292520, ups=0.68, wpb=428282, bsz=16112.2, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=24363 epoch 010: 1189 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=292520, ups=0.68, wpb=428282, bsz=16112.2, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=24363 epoch 010: 1189 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=292520, ups=0.68, wpb=428282, bsz=16112.2, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=24363 epoch 010: 1189 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=292520, ups=0.68, wpb=428282, bsz=16112.2, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=24363 epoch 010: 1189 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=292520, ups=0.68, wpb=428282, bsz=16112.2, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=24363 epoch 010: 1189 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=292520, ups=0.68, wpb=428282, bsz=16112.2, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=24363 epoch 010: 1289 / 1707 loss=4.241, nll_loss=2.627, ppl=6.18, wps=296258, ups=0.69, wpb=429092, bsz=16513.4, num_updates=16600, lr=0.000490881, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=24508 epoch 010: 1289 / 1707 loss=4.241, nll_loss=2.627, ppl=6.18, wps=296258, ups=0.69, wpb=429092, bsz=16513.4, num_updates=16600, lr=0.000490881, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=24508 epoch 010: 1289 / 1707 loss=4.241, nll_loss=2.627, ppl=6.18, wps=296258, ups=0.69, wpb=429092, bsz=16513.4, num_updates=16600, lr=0.000490881, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=24508 epoch 010: 1289 / 1707 loss=4.241, nll_loss=2.627, ppl=6.18, wps=296258, ups=0.69, wpb=429092, bsz=16513.4, num_updates=16600, lr=0.000490881, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=24508 epoch 010: 1289 / 1707 loss=4.241, nll_loss=2.627, ppl=6.18, wps=296258, ups=0.69, wpb=429092, bsz=16513.4, num_updates=16600, lr=0.000490881, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=24508 epoch 010: 1289 / 1707 loss=4.241, nll_loss=2.627, ppl=6.18, wps=296258, ups=0.69, wpb=429092, bsz=16513.4, num_updates=16600, lr=0.000490881, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=24508 epoch 010: 1289 / 1707 loss=4.241, nll_loss=2.627, ppl=6.18, wps=296258, ups=0.69, wpb=429092, bsz=16513.4, num_updates=16600, lr=0.000490881, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=24508 epoch 010: 1289 / 1707 loss=4.241, nll_loss=2.627, ppl=6.18, wps=296258, ups=0.69, wpb=429092, bsz=16513.4, num_updates=16600, lr=0.000490881, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=24508 epoch 010: 1289 / 1707 loss=4.241, nll_loss=2.627, ppl=6.18, wps=296258, ups=0.69, wpb=429092, bsz=16513.4, num_updates=16600, lr=0.000490881, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=24508 epoch 010: 1289 / 1707 loss=4.241, nll_loss=2.627, ppl=6.18, wps=296258, ups=0.69, wpb=429092, bsz=16513.4, num_updates=16600, lr=0.000490881, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=24508 epoch 010: 1389 / 1707 loss=4.236, nll_loss=2.622, ppl=6.15, wps=296517, ups=0.69, wpb=429348, bsz=16456.6, num_updates=16700, lr=0.000489409, gnorm=0.258, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=24653 epoch 010: 1389 / 1707 loss=4.236, nll_loss=2.622, ppl=6.15, wps=296517, ups=0.69, wpb=429348, bsz=16456.6, num_updates=16700, lr=0.000489409, gnorm=0.258, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=24653 epoch 010: 1389 / 1707 loss=4.236, nll_loss=2.622, ppl=6.15, wps=296517, ups=0.69, wpb=429348, bsz=16456.6, num_updates=16700, lr=0.000489409, gnorm=0.258, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=24653 epoch 010: 1389 / 1707 loss=4.236, nll_loss=2.622, ppl=6.15, wps=296517, ups=0.69, wpb=429348, bsz=16456.6, num_updates=16700, lr=0.000489409, gnorm=0.258, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=24653 epoch 010: 1389 / 1707 loss=4.236, nll_loss=2.622, ppl=6.15, wps=296517, ups=0.69, wpb=429348, bsz=16456.6, num_updates=16700, lr=0.000489409, gnorm=0.258, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=24653 epoch 010: 1389 / 1707 loss=4.236, nll_loss=2.622, ppl=6.15, wps=296517, ups=0.69, wpb=429348, bsz=16456.6, num_updates=16700, lr=0.000489409, gnorm=0.258, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=24653 epoch 010: 1389 / 1707 loss=4.236, nll_loss=2.622, ppl=6.15, wps=296517, ups=0.69, wpb=429348, bsz=16456.6, num_updates=16700, lr=0.000489409, gnorm=0.258, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=24653 epoch 010: 1389 / 1707 loss=4.236, nll_loss=2.622, ppl=6.15, wps=296517, ups=0.69, wpb=429348, bsz=16456.6, num_updates=16700, lr=0.000489409, gnorm=0.258, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=24653 epoch 010: 1389 / 1707 loss=4.236, nll_loss=2.622, ppl=6.15, wps=296517, ups=0.69, wpb=429348, bsz=16456.6, num_updates=16700, lr=0.000489409, gnorm=0.258, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=24653 epoch 010: 1389 / 1707 loss=4.236, nll_loss=2.622, ppl=6.15, wps=296517, ups=0.69, wpb=429348, bsz=16456.6, num_updates=16700, lr=0.000489409, gnorm=0.258, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=24653 epoch 010: 1489 / 1707 loss=4.233, nll_loss=2.618, ppl=6.14, wps=295650, ups=0.69, wpb=429019, bsz=16402.7, num_updates=16800, lr=0.00048795, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=24798 epoch 010: 1489 / 1707 loss=4.233, nll_loss=2.618, ppl=6.14, wps=295650, ups=0.69, wpb=429019, bsz=16402.7, num_updates=16800, lr=0.00048795, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=24798 epoch 010: 1489 / 1707 loss=4.233, nll_loss=2.618, ppl=6.14, wps=295650, ups=0.69, wpb=429019, bsz=16402.7, num_updates=16800, lr=0.00048795, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=24798 epoch 010: 1489 / 1707 loss=4.233, nll_loss=2.618, ppl=6.14, wps=295650, ups=0.69, wpb=429019, bsz=16402.7, num_updates=16800, lr=0.00048795, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=24798 epoch 010: 1489 / 1707 loss=4.233, nll_loss=2.618, ppl=6.14, wps=295650, ups=0.69, wpb=429019, bsz=16402.7, num_updates=16800, lr=0.00048795, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=24798 epoch 010: 1489 / 1707 loss=4.233, nll_loss=2.618, ppl=6.14, wps=295650, ups=0.69, wpb=429019, bsz=16402.7, num_updates=16800, lr=0.00048795, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=24798 epoch 010: 1489 / 1707 loss=4.233, nll_loss=2.618, ppl=6.14, wps=295650, ups=0.69, wpb=429019, bsz=16402.7, num_updates=16800, lr=0.00048795, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=24798 epoch 010: 1489 / 1707 loss=4.233, nll_loss=2.618, ppl=6.14, wps=295650, ups=0.69, wpb=429019, bsz=16402.7, num_updates=16800, lr=0.00048795, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=24798 epoch 010: 1489 / 1707 loss=4.233, nll_loss=2.618, ppl=6.14, wps=295650, ups=0.69, wpb=429019, bsz=16402.7, num_updates=16800, lr=0.00048795, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=24798 epoch 010: 1489 / 1707 loss=4.233, nll_loss=2.618, ppl=6.14, wps=295650, ups=0.69, wpb=429019, bsz=16402.7, num_updates=16800, lr=0.00048795, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=24798 epoch 010: 1589 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=296238, ups=0.69, wpb=429484, bsz=16325.5, num_updates=16900, lr=0.000486504, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=24943 epoch 010: 1589 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=296238, ups=0.69, wpb=429484, bsz=16325.5, num_updates=16900, lr=0.000486504, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=24943 epoch 010: 1589 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=296238, ups=0.69, wpb=429484, bsz=16325.5, num_updates=16900, lr=0.000486504, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=24943 epoch 010: 1589 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=296238, ups=0.69, wpb=429484, bsz=16325.5, num_updates=16900, lr=0.000486504, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=24943 epoch 010: 1589 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=296238, ups=0.69, wpb=429484, bsz=16325.5, num_updates=16900, lr=0.000486504, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=24943 epoch 010: 1589 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=296238, ups=0.69, wpb=429484, bsz=16325.5, num_updates=16900, lr=0.000486504, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=24943 epoch 010: 1589 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=296238, ups=0.69, wpb=429484, bsz=16325.5, num_updates=16900, lr=0.000486504, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=24943 epoch 010: 1589 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=296238, ups=0.69, wpb=429484, bsz=16325.5, num_updates=16900, lr=0.000486504, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=24943 epoch 010: 1589 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=296238, ups=0.69, wpb=429484, bsz=16325.5, num_updates=16900, lr=0.000486504, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=24943 epoch 010: 1589 / 1707 loss=4.238, nll_loss=2.623, ppl=6.16, wps=296238, ups=0.69, wpb=429484, bsz=16325.5, num_updates=16900, lr=0.000486504, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=24943 epoch 010: 1689 / 1707 loss=4.24, nll_loss=2.626, ppl=6.17, wps=295728, ups=0.69, wpb=428121, bsz=16149.7, num_updates=17000, lr=0.000485071, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=25088 epoch 010: 1689 / 1707 loss=4.24, nll_loss=2.626, ppl=6.17, wps=295728, ups=0.69, wpb=428121, bsz=16149.7, num_updates=17000, lr=0.000485071, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=25088 epoch 010: 1689 / 1707 loss=4.24, nll_loss=2.626, ppl=6.17, wps=295728, ups=0.69, wpb=428121, bsz=16149.7, num_updates=17000, lr=0.000485071, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=25088 epoch 010: 1689 / 1707 loss=4.24, nll_loss=2.626, ppl=6.17, wps=295728, ups=0.69, wpb=428121, bsz=16149.7, num_updates=17000, lr=0.000485071, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=25088 epoch 010: 1689 / 1707 loss=4.24, nll_loss=2.626, ppl=6.17, wps=295728, ups=0.69, wpb=428121, bsz=16149.7, num_updates=17000, lr=0.000485071, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=25088 epoch 010: 1689 / 1707 loss=4.24, nll_loss=2.626, ppl=6.17, wps=295728, ups=0.69, wpb=428121, bsz=16149.7, num_updates=17000, lr=0.000485071, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=25088 epoch 010: 1689 / 1707 loss=4.24, nll_loss=2.626, ppl=6.17, wps=295728, ups=0.69, wpb=428121, bsz=16149.7, num_updates=17000, lr=0.000485071, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=25088 epoch 010: 1689 / 1707 loss=4.24, nll_loss=2.626, ppl=6.17, wps=295728, ups=0.69, wpb=428121, bsz=16149.7, num_updates=17000, lr=0.000485071, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=25088 epoch 010: 1689 / 1707 loss=4.24, nll_loss=2.626, ppl=6.17, wps=295728, ups=0.69, wpb=428121, bsz=16149.7, num_updates=17000, lr=0.000485071, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=25088 epoch 010: 1689 / 1707 loss=4.24, nll_loss=2.626, ppl=6.17, wps=295728, ups=0.69, wpb=428121, bsz=16149.7, num_updates=17000, lr=0.000485071, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=25088 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 4.355 | nll_loss 2.738 | ppl 6.67 | wps 75540.2 | wpb 21331 | bsz 1016 | num_updates 17000 | best_loss 4.355 epoch 010 | valid on 'valid' subset | loss 4.355 | nll_loss 2.738 | ppl 6.67 | wps 75540.2 | wpb 21331 | bsz 1016 | num_updates 17000 | best_loss 4.355 epoch 010 | valid on 'valid' subset | loss 4.355 | nll_loss 2.738 | ppl 6.67 | wps 75540.2 | wpb 21331 | bsz 1016 | num_updates 17000 | best_loss 4.355 epoch 010 | valid on 'valid' subset | loss 4.355 | nll_loss 2.738 | ppl 6.67 | wps 75540.2 | wpb 21331 | bsz 1016 | num_updates 17000 | best_loss 4.355 epoch 010 | valid on 'valid' subset | loss 4.355 | nll_loss 2.738 | ppl 6.67 | wps 75540.2 | wpb 21331 | bsz 1016 | num_updates 17000 | best_loss 4.355 epoch 010 | valid on 'valid' subset | loss 4.355 | nll_loss 2.738 | ppl 6.67 | wps 75540.2 | wpb 21331 | bsz 1016 | num_updates 17000 | best_loss 4.355 epoch 010 | valid on 'valid' subset | loss 4.355 | nll_loss 2.738 | ppl 6.67 | wps 75540.2 | wpb 21331 | bsz 1016 | num_updates 17000 | best_loss 4.355 epoch 010 | valid on 'valid' subset | loss 4.355 | nll_loss 2.738 | ppl 6.67 | wps 75540.2 | wpb 21331 | bsz 1016 | num_updates 17000 | best_loss 4.355 epoch 010 | valid on 'valid' subset | loss 4.355 | nll_loss 2.738 | ppl 6.67 | wps 75540.2 | wpb 21331 | bsz 1016 | num_updates 17000 | best_loss 4.355 epoch 010 | valid on 'valid' subset | loss 4.355 | nll_loss 2.738 | ppl 6.67 | wps 75540.2 | wpb 21331 | bsz 1016 | num_updates 17000 | best_loss 4.355 end of epoch 10 (average epoch stats below) epoch 010 | loss 4.233 | nll_loss 2.617 | ppl 6.14 | wps 291417 | ups 0.68 | wpb 428934 | bsz 16333.8 | num_updates 17018 | lr 0.000484815 | gnorm 0.253 | clip 0 | loss_scale 4 | train_wall 2459 | gb_free 59.8 | wall 25130 epoch 010 | loss 4.233 | nll_loss 2.617 | ppl 6.14 | wps 291417 | ups 0.68 | wpb 428934 | bsz 16333.8 | num_updates 17018 | lr 0.000484815 | gnorm 0.253 | clip 0 | loss_scale 4 | train_wall 2459 | gb_free 59.8 | wall 25130 epoch 010 | loss 4.233 | nll_loss 2.617 | ppl 6.14 | wps 291417 | ups 0.68 | wpb 428934 | bsz 16333.8 | num_updates 17018 | lr 0.000484815 | gnorm 0.253 | clip 0 | loss_scale 4 | train_wall 2459 | gb_free 59.8 | wall 25130 epoch 010 | loss 4.233 | nll_loss 2.617 | ppl 6.14 | wps 291417 | ups 0.68 | wpb 428934 | bsz 16333.8 | num_updates 17018 | lr 0.000484815 | gnorm 0.253 | clip 0 | loss_scale 4 | train_wall 2459 | gb_free 59.8 | wall 25130 epoch 010 | loss 4.233 | nll_loss 2.617 | ppl 6.14 | wps 291417 | ups 0.68 | wpb 428934 | bsz 16333.8 | num_updates 17018 | lr 0.000484815 | gnorm 0.253 | clip 0 | loss_scale 4 | train_wall 2459 | gb_free 59.8 | wall 25130 epoch 010 | loss 4.233 | nll_loss 2.617 | ppl 6.14 | wps 291417 | ups 0.68 | wpb 428934 | bsz 16333.8 | num_updates 17018 | lr 0.000484815 | gnorm 0.253 | clip 0 | loss_scale 4 | train_wall 2459 | gb_free 59.8 | wall 25130 epoch 010 | loss 4.233 | nll_loss 2.617 | ppl 6.14 | wps 291417 | ups 0.68 | wpb 428934 | bsz 16333.8 | num_updates 17018 | lr 0.000484815 | gnorm 0.253 | clip 0 | loss_scale 4 | train_wall 2459 | gb_free 59.8 | wall 25130 epoch 010 | loss 4.233 | nll_loss 2.617 | ppl 6.14 | wps 291417 | ups 0.68 | wpb 428934 | bsz 16333.8 | num_updates 17018 | lr 0.000484815 | gnorm 0.253 | clip 0 | loss_scale 4 | train_wall 2459 | gb_free 59.8 | wall 25130 epoch 010 | loss 4.233 | nll_loss 2.617 | ppl 6.14 | wps 291417 | ups 0.68 | wpb 428934 | bsz 16333.8 | num_updates 17018 | lr 0.000484815 | gnorm 0.253 | clip 0 | loss_scale 4 | train_wall 2459 | gb_free 59.8 | wall 25130 epoch 010 | loss 4.233 | nll_loss 2.617 | ppl 6.14 | wps 291417 | ups 0.68 | wpb 428934 | bsz 16333.8 | num_updates 17018 | lr 0.000484815 | gnorm 0.253 | clip 0 | loss_scale 4 | train_wall 2459 | gb_free 59.8 | wall 25130 Start iterating over samples epoch 011: 83 / 1707 loss=4.204, nll_loss=2.583, ppl=5.99, wps=261502, ups=0.61, wpb=425734, bsz=16090.2, num_updates=17100, lr=0.000483651, gnorm=0.261, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=25251 epoch 011: 83 / 1707 loss=4.204, nll_loss=2.583, ppl=5.99, wps=261502, ups=0.61, wpb=425734, bsz=16090.2, num_updates=17100, lr=0.000483651, gnorm=0.261, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=25251 epoch 011: 83 / 1707 loss=4.204, nll_loss=2.583, ppl=5.99, wps=261502, ups=0.61, wpb=425734, bsz=16090.2, num_updates=17100, lr=0.000483651, gnorm=0.261, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=25251 epoch 011: 83 / 1707 loss=4.204, nll_loss=2.583, ppl=5.99, wps=261502, ups=0.61, wpb=425734, bsz=16090.2, num_updates=17100, lr=0.000483651, gnorm=0.261, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=25251 epoch 011: 83 / 1707 loss=4.204, nll_loss=2.583, ppl=5.99, wps=261502, ups=0.61, wpb=425734, bsz=16090.2, num_updates=17100, lr=0.000483651, gnorm=0.261, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=25251 epoch 011: 83 / 1707 loss=4.204, nll_loss=2.583, ppl=5.99, wps=261502, ups=0.61, wpb=425734, bsz=16090.2, num_updates=17100, lr=0.000483651, gnorm=0.261, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=25251 epoch 011: 83 / 1707 loss=4.204, nll_loss=2.583, ppl=5.99, wps=261502, ups=0.61, wpb=425734, bsz=16090.2, num_updates=17100, lr=0.000483651, gnorm=0.261, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=25251 epoch 011: 83 / 1707 loss=4.204, nll_loss=2.583, ppl=5.99, wps=261502, ups=0.61, wpb=425734, bsz=16090.2, num_updates=17100, lr=0.000483651, gnorm=0.261, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=25251 epoch 011: 83 / 1707 loss=4.204, nll_loss=2.583, ppl=5.99, wps=261502, ups=0.61, wpb=425734, bsz=16090.2, num_updates=17100, lr=0.000483651, gnorm=0.261, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=25251 epoch 011: 83 / 1707 loss=4.204, nll_loss=2.583, ppl=5.99, wps=261502, ups=0.61, wpb=425734, bsz=16090.2, num_updates=17100, lr=0.000483651, gnorm=0.261, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=25251 epoch 011: 83 / 1707 loss=4.204, nll_loss=2.583, ppl=5.99, wps=261502, ups=0.61, wpb=425734, bsz=16090.2, num_updates=17100, lr=0.000483651, gnorm=0.261, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=25251 epoch 011: 184 / 1707 loss=4.197, nll_loss=2.576, ppl=5.96, wps=294795, ups=0.69, wpb=429865, bsz=16241.6, num_updates=17200, lr=0.000482243, gnorm=0.251, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=25396 epoch 011: 184 / 1707 loss=4.197, nll_loss=2.576, ppl=5.96, wps=294795, ups=0.69, wpb=429865, bsz=16241.6, num_updates=17200, lr=0.000482243, gnorm=0.251, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=25396 epoch 011: 184 / 1707 loss=4.197, nll_loss=2.576, ppl=5.96, wps=294795, ups=0.69, wpb=429865, bsz=16241.6, num_updates=17200, lr=0.000482243, gnorm=0.251, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=25396 epoch 011: 184 / 1707 loss=4.197, nll_loss=2.576, ppl=5.96, wps=294795, ups=0.69, wpb=429865, bsz=16241.6, num_updates=17200, lr=0.000482243, gnorm=0.251, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=25396 epoch 011: 184 / 1707 loss=4.197, nll_loss=2.576, ppl=5.96, wps=294795, ups=0.69, wpb=429865, bsz=16241.6, num_updates=17200, lr=0.000482243, gnorm=0.251, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=25396 epoch 011: 184 / 1707 loss=4.197, nll_loss=2.576, ppl=5.96, wps=294795, ups=0.69, wpb=429865, bsz=16241.6, num_updates=17200, lr=0.000482243, gnorm=0.251, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=25396 epoch 011: 184 / 1707 loss=4.197, nll_loss=2.576, ppl=5.96, wps=294795, ups=0.69, wpb=429865, bsz=16241.6, num_updates=17200, lr=0.000482243, gnorm=0.251, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=25396 epoch 011: 184 / 1707 loss=4.197, nll_loss=2.576, ppl=5.96, wps=294795, ups=0.69, wpb=429865, bsz=16241.6, num_updates=17200, lr=0.000482243, gnorm=0.251, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=25396 epoch 011: 184 / 1707 loss=4.197, nll_loss=2.576, ppl=5.96, wps=294795, ups=0.69, wpb=429865, bsz=16241.6, num_updates=17200, lr=0.000482243, gnorm=0.251, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=25396 epoch 011: 184 / 1707 loss=4.197, nll_loss=2.576, ppl=5.96, wps=294795, ups=0.69, wpb=429865, bsz=16241.6, num_updates=17200, lr=0.000482243, gnorm=0.251, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=25396 epoch 011: 184 / 1707 loss=4.197, nll_loss=2.576, ppl=5.96, wps=294795, ups=0.69, wpb=429865, bsz=16241.6, num_updates=17200, lr=0.000482243, gnorm=0.251, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=25396 epoch 011: 284 / 1707 loss=4.196, nll_loss=2.575, ppl=5.96, wps=294547, ups=0.69, wpb=427791, bsz=16490, num_updates=17300, lr=0.000480847, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=25542 epoch 011: 284 / 1707 loss=4.196, nll_loss=2.575, ppl=5.96, wps=294547, ups=0.69, wpb=427791, bsz=16490, num_updates=17300, lr=0.000480847, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=25542 epoch 011: 284 / 1707 loss=4.196, nll_loss=2.575, ppl=5.96, wps=294547, ups=0.69, wpb=427791, bsz=16490, num_updates=17300, lr=0.000480847, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=25542 epoch 011: 284 / 1707 loss=4.196, nll_loss=2.575, ppl=5.96, wps=294547, ups=0.69, wpb=427791, bsz=16490, num_updates=17300, lr=0.000480847, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=25542 epoch 011: 284 / 1707 loss=4.196, nll_loss=2.575, ppl=5.96, wps=294547, ups=0.69, wpb=427791, bsz=16490, num_updates=17300, lr=0.000480847, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=25542 epoch 011: 284 / 1707 loss=4.196, nll_loss=2.575, ppl=5.96, wps=294547, ups=0.69, wpb=427791, bsz=16490, num_updates=17300, lr=0.000480847, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=25542 epoch 011: 284 / 1707 loss=4.196, nll_loss=2.575, ppl=5.96, wps=294547, ups=0.69, wpb=427791, bsz=16490, num_updates=17300, lr=0.000480847, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=25542 epoch 011: 284 / 1707 loss=4.196, nll_loss=2.575, ppl=5.96, wps=294547, ups=0.69, wpb=427791, bsz=16490, num_updates=17300, lr=0.000480847, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=25542 epoch 011: 284 / 1707 loss=4.196, nll_loss=2.575, ppl=5.96, wps=294547, ups=0.69, wpb=427791, bsz=16490, num_updates=17300, lr=0.000480847, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=25542 epoch 011: 284 / 1707 loss=4.196, nll_loss=2.575, ppl=5.96, wps=294547, ups=0.69, wpb=427791, bsz=16490, num_updates=17300, lr=0.000480847, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=25542 epoch 011: 284 / 1707 loss=4.196, nll_loss=2.575, ppl=5.96, wps=294547, ups=0.69, wpb=427791, bsz=16490, num_updates=17300, lr=0.000480847, gnorm=0.251, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=25542 epoch 011: 384 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=296174, ups=0.69, wpb=428711, bsz=16509.8, num_updates=17400, lr=0.000479463, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=25686 epoch 011: 384 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=296174, ups=0.69, wpb=428711, bsz=16509.8, num_updates=17400, lr=0.000479463, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=25686 epoch 011: 384 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=296174, ups=0.69, wpb=428711, bsz=16509.8, num_updates=17400, lr=0.000479463, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=25686 epoch 011: 384 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=296174, ups=0.69, wpb=428711, bsz=16509.8, num_updates=17400, lr=0.000479463, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=25686 epoch 011: 384 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=296174, ups=0.69, wpb=428711, bsz=16509.8, num_updates=17400, lr=0.000479463, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=25686 epoch 011: 384 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=296174, ups=0.69, wpb=428711, bsz=16509.8, num_updates=17400, lr=0.000479463, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=25686 epoch 011: 384 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=296174, ups=0.69, wpb=428711, bsz=16509.8, num_updates=17400, lr=0.000479463, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=25686 epoch 011: 384 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=296174, ups=0.69, wpb=428711, bsz=16509.8, num_updates=17400, lr=0.000479463, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=25686 epoch 011: 384 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=296174, ups=0.69, wpb=428711, bsz=16509.8, num_updates=17400, lr=0.000479463, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=25686 epoch 011: 384 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=296174, ups=0.69, wpb=428711, bsz=16509.8, num_updates=17400, lr=0.000479463, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=25686 epoch 011: 384 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=296174, ups=0.69, wpb=428711, bsz=16509.8, num_updates=17400, lr=0.000479463, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=25686 epoch 011: 484 / 1707 loss=4.21, nll_loss=2.591, ppl=6.03, wps=296627, ups=0.69, wpb=429988, bsz=16519.9, num_updates=17500, lr=0.000478091, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=25831 epoch 011: 484 / 1707 loss=4.21, nll_loss=2.591, ppl=6.03, wps=296627, ups=0.69, wpb=429988, bsz=16519.9, num_updates=17500, lr=0.000478091, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=25831 epoch 011: 484 / 1707 loss=4.21, nll_loss=2.591, ppl=6.03, wps=296627, ups=0.69, wpb=429988, bsz=16519.9, num_updates=17500, lr=0.000478091, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=25831 epoch 011: 484 / 1707 loss=4.21, nll_loss=2.591, ppl=6.03, wps=296627, ups=0.69, wpb=429988, bsz=16519.9, num_updates=17500, lr=0.000478091, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=25831 epoch 011: 484 / 1707 loss=4.21, nll_loss=2.591, ppl=6.03, wps=296627, ups=0.69, wpb=429988, bsz=16519.9, num_updates=17500, lr=0.000478091, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=25831 epoch 011: 484 / 1707 loss=4.21, nll_loss=2.591, ppl=6.03, wps=296627, ups=0.69, wpb=429988, bsz=16519.9, num_updates=17500, lr=0.000478091, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=25831 epoch 011: 484 / 1707 loss=4.21, nll_loss=2.591, ppl=6.03, wps=296627, ups=0.69, wpb=429988, bsz=16519.9, num_updates=17500, lr=0.000478091, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=25831 epoch 011: 484 / 1707 loss=4.21, nll_loss=2.591, ppl=6.03, wps=296627, ups=0.69, wpb=429988, bsz=16519.9, num_updates=17500, lr=0.000478091, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=25831 epoch 011: 484 / 1707 loss=4.21, nll_loss=2.591, ppl=6.03, wps=296627, ups=0.69, wpb=429988, bsz=16519.9, num_updates=17500, lr=0.000478091, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=25831 epoch 011: 484 / 1707 loss=4.21, nll_loss=2.591, ppl=6.03, wps=296627, ups=0.69, wpb=429988, bsz=16519.9, num_updates=17500, lr=0.000478091, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=25831 epoch 011: 484 / 1707 loss=4.21, nll_loss=2.591, ppl=6.03, wps=296627, ups=0.69, wpb=429988, bsz=16519.9, num_updates=17500, lr=0.000478091, gnorm=0.263, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=25831 epoch 011: 584 / 1707 loss=4.218, nll_loss=2.601, ppl=6.07, wps=297261, ups=0.69, wpb=430199, bsz=16253.9, num_updates=17600, lr=0.000476731, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=25976 epoch 011: 584 / 1707 loss=4.218, nll_loss=2.601, ppl=6.07, wps=297261, ups=0.69, wpb=430199, bsz=16253.9, num_updates=17600, lr=0.000476731, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=25976 epoch 011: 584 / 1707 loss=4.218, nll_loss=2.601, ppl=6.07, wps=297261, ups=0.69, wpb=430199, bsz=16253.9, num_updates=17600, lr=0.000476731, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=25976 epoch 011: 584 / 1707 loss=4.218, nll_loss=2.601, ppl=6.07, wps=297261, ups=0.69, wpb=430199, bsz=16253.9, num_updates=17600, lr=0.000476731, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=25976 epoch 011: 584 / 1707 loss=4.218, nll_loss=2.601, ppl=6.07, wps=297261, ups=0.69, wpb=430199, bsz=16253.9, num_updates=17600, lr=0.000476731, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=25976 epoch 011: 584 / 1707 loss=4.218, nll_loss=2.601, ppl=6.07, wps=297261, ups=0.69, wpb=430199, bsz=16253.9, num_updates=17600, lr=0.000476731, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=25976 epoch 011: 584 / 1707 loss=4.218, nll_loss=2.601, ppl=6.07, wps=297261, ups=0.69, wpb=430199, bsz=16253.9, num_updates=17600, lr=0.000476731, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=25976 epoch 011: 584 / 1707 loss=4.218, nll_loss=2.601, ppl=6.07, wps=297261, ups=0.69, wpb=430199, bsz=16253.9, num_updates=17600, lr=0.000476731, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=25976 epoch 011: 584 / 1707 loss=4.218, nll_loss=2.601, ppl=6.07, wps=297261, ups=0.69, wpb=430199, bsz=16253.9, num_updates=17600, lr=0.000476731, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=25976 epoch 011: 584 / 1707 loss=4.218, nll_loss=2.601, ppl=6.07, wps=297261, ups=0.69, wpb=430199, bsz=16253.9, num_updates=17600, lr=0.000476731, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=25976 epoch 011: 584 / 1707 loss=4.218, nll_loss=2.601, ppl=6.07, wps=297261, ups=0.69, wpb=430199, bsz=16253.9, num_updates=17600, lr=0.000476731, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=25976 epoch 011: 684 / 1707 loss=4.204, nll_loss=2.584, ppl=6, wps=295276, ups=0.69, wpb=428546, bsz=16309, num_updates=17700, lr=0.000475383, gnorm=0.247, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=26121 epoch 011: 684 / 1707 loss=4.204, nll_loss=2.584, ppl=6, wps=295276, ups=0.69, wpb=428546, bsz=16309, num_updates=17700, lr=0.000475383, gnorm=0.247, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=26121 epoch 011: 684 / 1707 loss=4.204, nll_loss=2.584, ppl=6, wps=295276, ups=0.69, wpb=428546, bsz=16309, num_updates=17700, lr=0.000475383, gnorm=0.247, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=26121 epoch 011: 684 / 1707 loss=4.204, nll_loss=2.584, ppl=6, wps=295276, ups=0.69, wpb=428546, bsz=16309, num_updates=17700, lr=0.000475383, gnorm=0.247, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=26121 epoch 011: 684 / 1707 loss=4.204, nll_loss=2.584, ppl=6, wps=295276, ups=0.69, wpb=428546, bsz=16309, num_updates=17700, lr=0.000475383, gnorm=0.247, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=26121 epoch 011: 684 / 1707 loss=4.204, nll_loss=2.584, ppl=6, wps=295276, ups=0.69, wpb=428546, bsz=16309, num_updates=17700, lr=0.000475383, gnorm=0.247, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=26121 epoch 011: 684 / 1707 loss=4.204, nll_loss=2.584, ppl=6, wps=295276, ups=0.69, wpb=428546, bsz=16309, num_updates=17700, lr=0.000475383, gnorm=0.247, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=26121 epoch 011: 684 / 1707 loss=4.204, nll_loss=2.584, ppl=6, wps=295276, ups=0.69, wpb=428546, bsz=16309, num_updates=17700, lr=0.000475383, gnorm=0.247, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=26121 epoch 011: 684 / 1707 loss=4.204, nll_loss=2.584, ppl=6, wps=295276, ups=0.69, wpb=428546, bsz=16309, num_updates=17700, lr=0.000475383, gnorm=0.247, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=26121 epoch 011: 684 / 1707 loss=4.204, nll_loss=2.584, ppl=6, wps=295276, ups=0.69, wpb=428546, bsz=16309, num_updates=17700, lr=0.000475383, gnorm=0.247, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=26121 epoch 011: 684 / 1707 loss=4.204, nll_loss=2.584, ppl=6, wps=295276, ups=0.69, wpb=428546, bsz=16309, num_updates=17700, lr=0.000475383, gnorm=0.247, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=26121 epoch 011: 785 / 1707 loss=4.211, nll_loss=2.593, ppl=6.03, wps=293881, ups=0.68, wpb=429696, bsz=16495.8, num_updates=17800, lr=0.000474045, gnorm=0.247, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=26267 epoch 011: 785 / 1707 loss=4.211, nll_loss=2.593, ppl=6.03, wps=293881, ups=0.68, wpb=429696, bsz=16495.8, num_updates=17800, lr=0.000474045, gnorm=0.247, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=26267 epoch 011: 785 / 1707 loss=4.211, nll_loss=2.593, ppl=6.03, wps=293881, ups=0.68, wpb=429696, bsz=16495.8, num_updates=17800, lr=0.000474045, gnorm=0.247, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=26267 epoch 011: 785 / 1707 loss=4.211, nll_loss=2.593, ppl=6.03, wps=293881, ups=0.68, wpb=429696, bsz=16495.8, num_updates=17800, lr=0.000474045, gnorm=0.247, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=26267 epoch 011: 785 / 1707 loss=4.211, nll_loss=2.593, ppl=6.03, wps=293881, ups=0.68, wpb=429696, bsz=16495.8, num_updates=17800, lr=0.000474045, gnorm=0.247, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=26267 epoch 011: 785 / 1707 loss=4.211, nll_loss=2.593, ppl=6.03, wps=293881, ups=0.68, wpb=429696, bsz=16495.8, num_updates=17800, lr=0.000474045, gnorm=0.247, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=26267 epoch 011: 785 / 1707 loss=4.211, nll_loss=2.593, ppl=6.03, wps=293881, ups=0.68, wpb=429696, bsz=16495.8, num_updates=17800, lr=0.000474045, gnorm=0.247, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=26267 epoch 011: 785 / 1707 loss=4.211, nll_loss=2.593, ppl=6.03, wps=293881, ups=0.68, wpb=429696, bsz=16495.8, num_updates=17800, lr=0.000474045, gnorm=0.247, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=26267 epoch 011: 785 / 1707 loss=4.211, nll_loss=2.593, ppl=6.03, wps=293881, ups=0.68, wpb=429696, bsz=16495.8, num_updates=17800, lr=0.000474045, gnorm=0.247, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=26267 epoch 011: 785 / 1707 loss=4.211, nll_loss=2.593, ppl=6.03, wps=293881, ups=0.68, wpb=429696, bsz=16495.8, num_updates=17800, lr=0.000474045, gnorm=0.247, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=26267 epoch 011: 785 / 1707 loss=4.211, nll_loss=2.593, ppl=6.03, wps=293881, ups=0.68, wpb=429696, bsz=16495.8, num_updates=17800, lr=0.000474045, gnorm=0.247, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=26267 epoch 011: 885 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=294486, ups=0.69, wpb=426748, bsz=16219, num_updates=17900, lr=0.000472719, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=26412 epoch 011: 885 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=294486, ups=0.69, wpb=426748, bsz=16219, num_updates=17900, lr=0.000472719, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=26412 epoch 011: 885 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=294486, ups=0.69, wpb=426748, bsz=16219, num_updates=17900, lr=0.000472719, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=26412 epoch 011: 885 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=294486, ups=0.69, wpb=426748, bsz=16219, num_updates=17900, lr=0.000472719, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=26412 epoch 011: 885 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=294486, ups=0.69, wpb=426748, bsz=16219, num_updates=17900, lr=0.000472719, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=26412 epoch 011: 885 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=294486, ups=0.69, wpb=426748, bsz=16219, num_updates=17900, lr=0.000472719, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=26412 epoch 011: 885 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=294486, ups=0.69, wpb=426748, bsz=16219, num_updates=17900, lr=0.000472719, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=26412 epoch 011: 885 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=294486, ups=0.69, wpb=426748, bsz=16219, num_updates=17900, lr=0.000472719, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=26412 epoch 011: 885 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=294486, ups=0.69, wpb=426748, bsz=16219, num_updates=17900, lr=0.000472719, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=26412 epoch 011: 885 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=294486, ups=0.69, wpb=426748, bsz=16219, num_updates=17900, lr=0.000472719, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=26412 epoch 011: 885 / 1707 loss=4.21, nll_loss=2.592, ppl=6.03, wps=294486, ups=0.69, wpb=426748, bsz=16219, num_updates=17900, lr=0.000472719, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=26412 epoch 011: 985 / 1707 loss=4.211, nll_loss=2.594, ppl=6.04, wps=296255, ups=0.69, wpb=429119, bsz=16138.2, num_updates=18000, lr=0.000471405, gnorm=0.258, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=26557 epoch 011: 985 / 1707 loss=4.211, nll_loss=2.594, ppl=6.04, wps=296255, ups=0.69, wpb=429119, bsz=16138.2, num_updates=18000, lr=0.000471405, gnorm=0.258, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=26557 epoch 011: 985 / 1707 loss=4.211, nll_loss=2.594, ppl=6.04, wps=296255, ups=0.69, wpb=429119, bsz=16138.2, num_updates=18000, lr=0.000471405, gnorm=0.258, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=26557 epoch 011: 985 / 1707 loss=4.211, nll_loss=2.594, ppl=6.04, wps=296255, ups=0.69, wpb=429119, bsz=16138.2, num_updates=18000, lr=0.000471405, gnorm=0.258, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=26557 epoch 011: 985 / 1707 loss=4.211, nll_loss=2.594, ppl=6.04, wps=296255, ups=0.69, wpb=429119, bsz=16138.2, num_updates=18000, lr=0.000471405, gnorm=0.258, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=26557 epoch 011: 985 / 1707 loss=4.211, nll_loss=2.594, ppl=6.04, wps=296255, ups=0.69, wpb=429119, bsz=16138.2, num_updates=18000, lr=0.000471405, gnorm=0.258, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=26557 epoch 011: 985 / 1707 loss=4.211, nll_loss=2.594, ppl=6.04, wps=296255, ups=0.69, wpb=429119, bsz=16138.2, num_updates=18000, lr=0.000471405, gnorm=0.258, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=26557 epoch 011: 985 / 1707 loss=4.211, nll_loss=2.594, ppl=6.04, wps=296255, ups=0.69, wpb=429119, bsz=16138.2, num_updates=18000, lr=0.000471405, gnorm=0.258, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=26557 epoch 011: 985 / 1707 loss=4.211, nll_loss=2.594, ppl=6.04, wps=296255, ups=0.69, wpb=429119, bsz=16138.2, num_updates=18000, lr=0.000471405, gnorm=0.258, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=26557 epoch 011: 985 / 1707 loss=4.211, nll_loss=2.594, ppl=6.04, wps=296255, ups=0.69, wpb=429119, bsz=16138.2, num_updates=18000, lr=0.000471405, gnorm=0.258, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=26557 epoch 011: 985 / 1707 loss=4.211, nll_loss=2.594, ppl=6.04, wps=296255, ups=0.69, wpb=429119, bsz=16138.2, num_updates=18000, lr=0.000471405, gnorm=0.258, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=26557 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 4.343 | nll_loss 2.725 | ppl 6.61 | wps 76341.1 | wpb 21331 | bsz 1016 | num_updates 18000 | best_loss 4.343 epoch 011 | valid on 'valid' subset | loss 4.343 | nll_loss 2.725 | ppl 6.61 | wps 76341.1 | wpb 21331 | bsz 1016 | num_updates 18000 | best_loss 4.343 epoch 011 | valid on 'valid' subset | loss 4.343 | nll_loss 2.725 | ppl 6.61 | wps 76341.1 | wpb 21331 | bsz 1016 | num_updates 18000 | best_loss 4.343 epoch 011 | valid on 'valid' subset | loss 4.343 | nll_loss 2.725 | ppl 6.61 | wps 76341.1 | wpb 21331 | bsz 1016 | num_updates 18000 | best_loss 4.343 epoch 011 | valid on 'valid' subset | loss 4.343 | nll_loss 2.725 | ppl 6.61 | wps 76341.1 | wpb 21331 | bsz 1016 | num_updates 18000 | best_loss 4.343 epoch 011 | valid on 'valid' subset | loss 4.343 | nll_loss 2.725 | ppl 6.61 | wps 76341.1 | wpb 21331 | bsz 1016 | num_updates 18000 | best_loss 4.343 epoch 011 | valid on 'valid' subset | loss 4.343 | nll_loss 2.725 | ppl 6.61 | wps 76341.1 | wpb 21331 | bsz 1016 | num_updates 18000 | best_loss 4.343 epoch 011 | valid on 'valid' subset | loss 4.343 | nll_loss 2.725 | ppl 6.61 | wps 76341.1 | wpb 21331 | bsz 1016 | num_updates 18000 | best_loss 4.343 epoch 011 | valid on 'valid' subset | loss 4.343 | nll_loss 2.725 | ppl 6.61 | wps 76341.1 | wpb 21331 | bsz 1016 | num_updates 18000 | best_loss 4.343 epoch 011 | valid on 'valid' subset | loss 4.343 | nll_loss 2.725 | ppl 6.61 | wps 76341.1 | wpb 21331 | bsz 1016 | num_updates 18000 | best_loss 4.343 epoch 011 | valid on 'valid' subset | loss 4.343 | nll_loss 2.725 | ppl 6.61 | wps 76341.1 | wpb 21331 | bsz 1016 | num_updates 18000 | best_loss 4.343 epoch 011: 1085 / 1707 loss=4.205, nll_loss=2.586, ppl=6, wps=261460, ups=0.61, wpb=428263, bsz=16201.4, num_updates=18100, lr=0.0004701, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=26721 epoch 011: 1085 / 1707 loss=4.205, nll_loss=2.586, ppl=6, wps=261460, ups=0.61, wpb=428263, bsz=16201.4, num_updates=18100, lr=0.0004701, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=26721 epoch 011: 1085 / 1707 loss=4.205, nll_loss=2.586, ppl=6, wps=261460, ups=0.61, wpb=428263, bsz=16201.4, num_updates=18100, lr=0.0004701, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=26721 epoch 011: 1085 / 1707 loss=4.205, nll_loss=2.586, ppl=6, wps=261460, ups=0.61, wpb=428263, bsz=16201.4, num_updates=18100, lr=0.0004701, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=26721 epoch 011: 1085 / 1707 loss=4.205, nll_loss=2.586, ppl=6, wps=261460, ups=0.61, wpb=428263, bsz=16201.4, num_updates=18100, lr=0.0004701, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=26721 epoch 011: 1085 / 1707 loss=4.205, nll_loss=2.586, ppl=6, wps=261460, ups=0.61, wpb=428263, bsz=16201.4, num_updates=18100, lr=0.0004701, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=26721 epoch 011: 1085 / 1707 loss=4.205, nll_loss=2.586, ppl=6, wps=261460, ups=0.61, wpb=428263, bsz=16201.4, num_updates=18100, lr=0.0004701, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=26721 epoch 011: 1085 / 1707 loss=4.205, nll_loss=2.586, ppl=6, wps=261460, ups=0.61, wpb=428263, bsz=16201.4, num_updates=18100, lr=0.0004701, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=26721 epoch 011: 1085 / 1707 loss=4.205, nll_loss=2.586, ppl=6, wps=261460, ups=0.61, wpb=428263, bsz=16201.4, num_updates=18100, lr=0.0004701, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=26721 epoch 011: 1085 / 1707 loss=4.205, nll_loss=2.586, ppl=6, wps=261460, ups=0.61, wpb=428263, bsz=16201.4, num_updates=18100, lr=0.0004701, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=26721 epoch 011: 1085 / 1707 loss=4.205, nll_loss=2.586, ppl=6, wps=261460, ups=0.61, wpb=428263, bsz=16201.4, num_updates=18100, lr=0.0004701, gnorm=0.252, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=26721 epoch 011: 1186 / 1707 loss=4.22, nll_loss=2.604, ppl=6.08, wps=294120, ups=0.68, wpb=430601, bsz=16662.3, num_updates=18200, lr=0.000468807, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=26867 epoch 011: 1186 / 1707 loss=4.22, nll_loss=2.604, ppl=6.08, wps=294120, ups=0.68, wpb=430601, bsz=16662.3, num_updates=18200, lr=0.000468807, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=26867 epoch 011: 1186 / 1707 loss=4.22, nll_loss=2.604, ppl=6.08, wps=294120, ups=0.68, wpb=430601, bsz=16662.3, num_updates=18200, lr=0.000468807, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=26867 epoch 011: 1186 / 1707 loss=4.22, nll_loss=2.604, ppl=6.08, wps=294120, ups=0.68, wpb=430601, bsz=16662.3, num_updates=18200, lr=0.000468807, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=26867 epoch 011: 1186 / 1707 loss=4.22, nll_loss=2.604, ppl=6.08, wps=294120, ups=0.68, wpb=430601, bsz=16662.3, num_updates=18200, lr=0.000468807, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=26867 epoch 011: 1186 / 1707 loss=4.22, nll_loss=2.604, ppl=6.08, wps=294120, ups=0.68, wpb=430601, bsz=16662.3, num_updates=18200, lr=0.000468807, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=26867 epoch 011: 1186 / 1707 loss=4.22, nll_loss=2.604, ppl=6.08, wps=294120, ups=0.68, wpb=430601, bsz=16662.3, num_updates=18200, lr=0.000468807, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=26867 epoch 011: 1186 / 1707 loss=4.22, nll_loss=2.604, ppl=6.08, wps=294120, ups=0.68, wpb=430601, bsz=16662.3, num_updates=18200, lr=0.000468807, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=26867 epoch 011: 1186 / 1707 loss=4.22, nll_loss=2.604, ppl=6.08, wps=294120, ups=0.68, wpb=430601, bsz=16662.3, num_updates=18200, lr=0.000468807, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=26867 epoch 011: 1186 / 1707 loss=4.22, nll_loss=2.604, ppl=6.08, wps=294120, ups=0.68, wpb=430601, bsz=16662.3, num_updates=18200, lr=0.000468807, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=26867 epoch 011: 1186 / 1707 loss=4.22, nll_loss=2.604, ppl=6.08, wps=294120, ups=0.68, wpb=430601, bsz=16662.3, num_updates=18200, lr=0.000468807, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=26867 epoch 011: 1286 / 1707 loss=4.216, nll_loss=2.599, ppl=6.06, wps=296516, ups=0.69, wpb=429291, bsz=16519.2, num_updates=18300, lr=0.000467525, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27012 epoch 011: 1286 / 1707 loss=4.216, nll_loss=2.599, ppl=6.06, wps=296516, ups=0.69, wpb=429291, bsz=16519.2, num_updates=18300, lr=0.000467525, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27012 epoch 011: 1286 / 1707 loss=4.216, nll_loss=2.599, ppl=6.06, wps=296516, ups=0.69, wpb=429291, bsz=16519.2, num_updates=18300, lr=0.000467525, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27012 epoch 011: 1286 / 1707 loss=4.216, nll_loss=2.599, ppl=6.06, wps=296516, ups=0.69, wpb=429291, bsz=16519.2, num_updates=18300, lr=0.000467525, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27012 epoch 011: 1286 / 1707 loss=4.216, nll_loss=2.599, ppl=6.06, wps=296516, ups=0.69, wpb=429291, bsz=16519.2, num_updates=18300, lr=0.000467525, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27012 epoch 011: 1286 / 1707 loss=4.216, nll_loss=2.599, ppl=6.06, wps=296516, ups=0.69, wpb=429291, bsz=16519.2, num_updates=18300, lr=0.000467525, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27012 epoch 011: 1286 / 1707 loss=4.216, nll_loss=2.599, ppl=6.06, wps=296516, ups=0.69, wpb=429291, bsz=16519.2, num_updates=18300, lr=0.000467525, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27012 epoch 011: 1286 / 1707 loss=4.216, nll_loss=2.599, ppl=6.06, wps=296516, ups=0.69, wpb=429291, bsz=16519.2, num_updates=18300, lr=0.000467525, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27012 epoch 011: 1286 / 1707 loss=4.216, nll_loss=2.599, ppl=6.06, wps=296516, ups=0.69, wpb=429291, bsz=16519.2, num_updates=18300, lr=0.000467525, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27012 epoch 011: 1286 / 1707 loss=4.216, nll_loss=2.599, ppl=6.06, wps=296516, ups=0.69, wpb=429291, bsz=16519.2, num_updates=18300, lr=0.000467525, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27012 epoch 011: 1286 / 1707 loss=4.216, nll_loss=2.599, ppl=6.06, wps=296516, ups=0.69, wpb=429291, bsz=16519.2, num_updates=18300, lr=0.000467525, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27012 epoch 011: 1386 / 1707 loss=4.206, nll_loss=2.588, ppl=6.01, wps=295840, ups=0.69, wpb=428547, bsz=16230.6, num_updates=18400, lr=0.000466252, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=27157 epoch 011: 1386 / 1707 loss=4.206, nll_loss=2.588, ppl=6.01, wps=295840, ups=0.69, wpb=428547, bsz=16230.6, num_updates=18400, lr=0.000466252, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=27157 epoch 011: 1386 / 1707 loss=4.206, nll_loss=2.588, ppl=6.01, wps=295840, ups=0.69, wpb=428547, bsz=16230.6, num_updates=18400, lr=0.000466252, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=27157 epoch 011: 1386 / 1707 loss=4.206, nll_loss=2.588, ppl=6.01, wps=295840, ups=0.69, wpb=428547, bsz=16230.6, num_updates=18400, lr=0.000466252, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=27157 epoch 011: 1386 / 1707 loss=4.206, nll_loss=2.588, ppl=6.01, wps=295840, ups=0.69, wpb=428547, bsz=16230.6, num_updates=18400, lr=0.000466252, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=27157 epoch 011: 1386 / 1707 loss=4.206, nll_loss=2.588, ppl=6.01, wps=295840, ups=0.69, wpb=428547, bsz=16230.6, num_updates=18400, lr=0.000466252, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=27157 epoch 011: 1386 / 1707 loss=4.206, nll_loss=2.588, ppl=6.01, wps=295840, ups=0.69, wpb=428547, bsz=16230.6, num_updates=18400, lr=0.000466252, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=27157 epoch 011: 1386 / 1707 loss=4.206, nll_loss=2.588, ppl=6.01, wps=295840, ups=0.69, wpb=428547, bsz=16230.6, num_updates=18400, lr=0.000466252, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=27157 epoch 011: 1386 / 1707 loss=4.206, nll_loss=2.588, ppl=6.01, wps=295840, ups=0.69, wpb=428547, bsz=16230.6, num_updates=18400, lr=0.000466252, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=27157 epoch 011: 1386 / 1707 loss=4.206, nll_loss=2.588, ppl=6.01, wps=295840, ups=0.69, wpb=428547, bsz=16230.6, num_updates=18400, lr=0.000466252, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=27157 epoch 011: 1386 / 1707 loss=4.206, nll_loss=2.588, ppl=6.01, wps=295840, ups=0.69, wpb=428547, bsz=16230.6, num_updates=18400, lr=0.000466252, gnorm=0.254, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=27157 epoch 011: 1486 / 1707 loss=4.213, nll_loss=2.596, ppl=6.05, wps=295924, ups=0.69, wpb=429273, bsz=16384.2, num_updates=18500, lr=0.000464991, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=27302 epoch 011: 1486 / 1707 loss=4.213, nll_loss=2.596, ppl=6.05, wps=295924, ups=0.69, wpb=429273, bsz=16384.2, num_updates=18500, lr=0.000464991, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=27302 epoch 011: 1486 / 1707 loss=4.213, nll_loss=2.596, ppl=6.05, wps=295924, ups=0.69, wpb=429273, bsz=16384.2, num_updates=18500, lr=0.000464991, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=27302 epoch 011: 1486 / 1707 loss=4.213, nll_loss=2.596, ppl=6.05, wps=295924, ups=0.69, wpb=429273, bsz=16384.2, num_updates=18500, lr=0.000464991, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=27302 epoch 011: 1486 / 1707 loss=4.213, nll_loss=2.596, ppl=6.05, wps=295924, ups=0.69, wpb=429273, bsz=16384.2, num_updates=18500, lr=0.000464991, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=27302 epoch 011: 1486 / 1707 loss=4.213, nll_loss=2.596, ppl=6.05, wps=295924, ups=0.69, wpb=429273, bsz=16384.2, num_updates=18500, lr=0.000464991, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=27302 epoch 011: 1486 / 1707 loss=4.213, nll_loss=2.596, ppl=6.05, wps=295924, ups=0.69, wpb=429273, bsz=16384.2, num_updates=18500, lr=0.000464991, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=27302 epoch 011: 1486 / 1707 loss=4.213, nll_loss=2.596, ppl=6.05, wps=295924, ups=0.69, wpb=429273, bsz=16384.2, num_updates=18500, lr=0.000464991, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=27302 epoch 011: 1486 / 1707 loss=4.213, nll_loss=2.596, ppl=6.05, wps=295924, ups=0.69, wpb=429273, bsz=16384.2, num_updates=18500, lr=0.000464991, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=27302 epoch 011: 1486 / 1707 loss=4.213, nll_loss=2.596, ppl=6.05, wps=295924, ups=0.69, wpb=429273, bsz=16384.2, num_updates=18500, lr=0.000464991, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=27302 epoch 011: 1486 / 1707 loss=4.213, nll_loss=2.596, ppl=6.05, wps=295924, ups=0.69, wpb=429273, bsz=16384.2, num_updates=18500, lr=0.000464991, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=27302 epoch 011: 1586 / 1707 loss=4.202, nll_loss=2.584, ppl=6, wps=295800, ups=0.69, wpb=428392, bsz=16090.5, num_updates=18600, lr=0.000463739, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=27447 epoch 011: 1586 / 1707 loss=4.202, nll_loss=2.584, ppl=6, wps=295800, ups=0.69, wpb=428392, bsz=16090.5, num_updates=18600, lr=0.000463739, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=27447 epoch 011: 1586 / 1707 loss=4.202, nll_loss=2.584, ppl=6, wps=295800, ups=0.69, wpb=428392, bsz=16090.5, num_updates=18600, lr=0.000463739, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=27447 epoch 011: 1586 / 1707 loss=4.202, nll_loss=2.584, ppl=6, wps=295800, ups=0.69, wpb=428392, bsz=16090.5, num_updates=18600, lr=0.000463739, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=27447 epoch 011: 1586 / 1707 loss=4.202, nll_loss=2.584, ppl=6, wps=295800, ups=0.69, wpb=428392, bsz=16090.5, num_updates=18600, lr=0.000463739, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=27447 epoch 011: 1586 / 1707 loss=4.202, nll_loss=2.584, ppl=6, wps=295800, ups=0.69, wpb=428392, bsz=16090.5, num_updates=18600, lr=0.000463739, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=27447 epoch 011: 1586 / 1707 loss=4.202, nll_loss=2.584, ppl=6, wps=295800, ups=0.69, wpb=428392, bsz=16090.5, num_updates=18600, lr=0.000463739, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=27447 epoch 011: 1586 / 1707 loss=4.202, nll_loss=2.584, ppl=6, wps=295800, ups=0.69, wpb=428392, bsz=16090.5, num_updates=18600, lr=0.000463739, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=27447 epoch 011: 1586 / 1707 loss=4.202, nll_loss=2.584, ppl=6, wps=295800, ups=0.69, wpb=428392, bsz=16090.5, num_updates=18600, lr=0.000463739, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=27447 epoch 011: 1586 / 1707 loss=4.202, nll_loss=2.584, ppl=6, wps=295800, ups=0.69, wpb=428392, bsz=16090.5, num_updates=18600, lr=0.000463739, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=27447 epoch 011: 1586 / 1707 loss=4.202, nll_loss=2.584, ppl=6, wps=295800, ups=0.69, wpb=428392, bsz=16090.5, num_updates=18600, lr=0.000463739, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=27447 epoch 011: 1687 / 1707 loss=4.212, nll_loss=2.595, ppl=6.04, wps=292738, ups=0.68, wpb=430329, bsz=16287.5, num_updates=18700, lr=0.000462497, gnorm=0.251, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=27594 epoch 011: 1687 / 1707 loss=4.212, nll_loss=2.595, ppl=6.04, wps=292738, ups=0.68, wpb=430329, bsz=16287.5, num_updates=18700, lr=0.000462497, gnorm=0.251, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=27594 epoch 011: 1687 / 1707 loss=4.212, nll_loss=2.595, ppl=6.04, wps=292738, ups=0.68, wpb=430329, bsz=16287.5, num_updates=18700, lr=0.000462497, gnorm=0.251, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=27594 epoch 011: 1687 / 1707 loss=4.212, nll_loss=2.595, ppl=6.04, wps=292738, ups=0.68, wpb=430329, bsz=16287.5, num_updates=18700, lr=0.000462497, gnorm=0.251, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=27594 epoch 011: 1687 / 1707 loss=4.212, nll_loss=2.595, ppl=6.04, wps=292738, ups=0.68, wpb=430329, bsz=16287.5, num_updates=18700, lr=0.000462497, gnorm=0.251, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=27594 epoch 011: 1687 / 1707 loss=4.212, nll_loss=2.595, ppl=6.04, wps=292738, ups=0.68, wpb=430329, bsz=16287.5, num_updates=18700, lr=0.000462497, gnorm=0.251, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=27594 epoch 011: 1687 / 1707 loss=4.212, nll_loss=2.595, ppl=6.04, wps=292738, ups=0.68, wpb=430329, bsz=16287.5, num_updates=18700, lr=0.000462497, gnorm=0.251, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=27594 epoch 011: 1687 / 1707 loss=4.212, nll_loss=2.595, ppl=6.04, wps=292738, ups=0.68, wpb=430329, bsz=16287.5, num_updates=18700, lr=0.000462497, gnorm=0.251, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=27594 epoch 011: 1687 / 1707 loss=4.212, nll_loss=2.595, ppl=6.04, wps=292738, ups=0.68, wpb=430329, bsz=16287.5, num_updates=18700, lr=0.000462497, gnorm=0.251, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=27594 epoch 011: 1687 / 1707 loss=4.212, nll_loss=2.595, ppl=6.04, wps=292738, ups=0.68, wpb=430329, bsz=16287.5, num_updates=18700, lr=0.000462497, gnorm=0.251, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=27594 epoch 011: 1687 / 1707 loss=4.212, nll_loss=2.595, ppl=6.04, wps=292738, ups=0.68, wpb=430329, bsz=16287.5, num_updates=18700, lr=0.000462497, gnorm=0.251, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=27594 end of epoch 11 (average epoch stats below) epoch 011 | loss 4.208 | nll_loss 2.59 | ppl 6.02 | wps 292973 | ups 0.68 | wpb 428930 | bsz 16335.2 | num_updates 18720 | lr 0.00046225 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 27622 epoch 011 | loss 4.208 | nll_loss 2.59 | ppl 6.02 | wps 292973 | ups 0.68 | wpb 428930 | bsz 16335.2 | num_updates 18720 | lr 0.00046225 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 27622 epoch 011 | loss 4.208 | nll_loss 2.59 | ppl 6.02 | wps 292973 | ups 0.68 | wpb 428930 | bsz 16335.2 | num_updates 18720 | lr 0.00046225 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 27622 epoch 011 | loss 4.208 | nll_loss 2.59 | ppl 6.02 | wps 292973 | ups 0.68 | wpb 428930 | bsz 16335.2 | num_updates 18720 | lr 0.00046225 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 27622 epoch 011 | loss 4.208 | nll_loss 2.59 | ppl 6.02 | wps 292973 | ups 0.68 | wpb 428930 | bsz 16335.2 | num_updates 18720 | lr 0.00046225 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 27622 epoch 011 | loss 4.208 | nll_loss 2.59 | ppl 6.02 | wps 292973 | ups 0.68 | wpb 428930 | bsz 16335.2 | num_updates 18720 | lr 0.00046225 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 27622 epoch 011 | loss 4.208 | nll_loss 2.59 | ppl 6.02 | wps 292973 | ups 0.68 | wpb 428930 | bsz 16335.2 | num_updates 18720 | lr 0.00046225 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 27622 epoch 011 | loss 4.208 | nll_loss 2.59 | ppl 6.02 | wps 292973 | ups 0.68 | wpb 428930 | bsz 16335.2 | num_updates 18720 | lr 0.00046225 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 27622 epoch 011 | loss 4.208 | nll_loss 2.59 | ppl 6.02 | wps 292973 | ups 0.68 | wpb 428930 | bsz 16335.2 | num_updates 18720 | lr 0.00046225 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 27622 epoch 011 | loss 4.208 | nll_loss 2.59 | ppl 6.02 | wps 292973 | ups 0.68 | wpb 428930 | bsz 16335.2 | num_updates 18720 | lr 0.00046225 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 27622 epoch 011 | loss 4.208 | nll_loss 2.59 | ppl 6.02 | wps 292973 | ups 0.68 | wpb 428930 | bsz 16335.2 | num_updates 18720 | lr 0.00046225 | gnorm 0.251 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 27622 Start iterating over samples epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 80 / 1707 loss=4.178, nll_loss=2.556, ppl=5.88, wps=295197, ups=0.69, wpb=425986, bsz=16325, num_updates=18800, lr=0.000461266, gnorm=0.253, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=27738 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 180 / 1707 loss=4.173, nll_loss=2.549, ppl=5.85, wps=296214, ups=0.69, wpb=428430, bsz=15996.1, num_updates=18900, lr=0.000460044, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=27883 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 epoch 012: 280 / 1707 loss=4.172, nll_loss=2.549, ppl=5.85, wps=296048, ups=0.69, wpb=428743, bsz=16550.1, num_updates=19000, lr=0.000458831, gnorm=0.259, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28028 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012 | valid on 'valid' subset | loss 4.349 | nll_loss 2.728 | ppl 6.63 | wps 76633.8 | wpb 21331 | bsz 1016 | num_updates 19000 | best_loss 4.343 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 380 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=273996, ups=0.64, wpb=430007, bsz=16318.6, num_updates=19100, lr=0.000457629, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=28185 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 481 / 1707 loss=4.185, nll_loss=2.563, ppl=5.91, wps=293564, ups=0.68, wpb=429339, bsz=16142.6, num_updates=19200, lr=0.000456435, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=28331 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 581 / 1707 loss=4.188, nll_loss=2.567, ppl=5.92, wps=296206, ups=0.69, wpb=428651, bsz=16339, num_updates=19300, lr=0.000455251, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=28476 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 682 / 1707 loss=4.194, nll_loss=2.574, ppl=5.95, wps=293524, ups=0.68, wpb=429219, bsz=16249.1, num_updates=19400, lr=0.000454077, gnorm=0.256, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=28622 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 782 / 1707 loss=4.18, nll_loss=2.558, ppl=5.89, wps=295075, ups=0.69, wpb=427184, bsz=16130.5, num_updates=19500, lr=0.000452911, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=28767 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 882 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296612, ups=0.69, wpb=429657, bsz=16316, num_updates=19600, lr=0.000451754, gnorm=0.254, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=28911 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 983 / 1707 loss=4.184, nll_loss=2.563, ppl=5.91, wps=292376, ups=0.68, wpb=427954, bsz=16459.6, num_updates=19700, lr=0.000450606, gnorm=0.231, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=29058 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1083 / 1707 loss=4.19, nll_loss=2.57, ppl=5.94, wps=295655, ups=0.69, wpb=428398, bsz=16476.1, num_updates=19800, lr=0.000449467, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.6, wall=29203 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1183 / 1707 loss=4.193, nll_loss=2.574, ppl=5.95, wps=296315, ups=0.69, wpb=429196, bsz=16238.3, num_updates=19900, lr=0.000448336, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=29348 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 epoch 012: 1283 / 1707 loss=4.194, nll_loss=2.575, ppl=5.96, wps=296406, ups=0.69, wpb=429302, bsz=16501.5, num_updates=20000, lr=0.000447214, gnorm=0.255, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=29492 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012 | valid on 'valid' subset | loss 4.332 | nll_loss 2.711 | ppl 6.55 | wps 76666.2 | wpb 21331 | bsz 1016 | num_updates 20000 | best_loss 4.332 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1383 / 1707 loss=4.193, nll_loss=2.574, ppl=5.96, wps=265887, ups=0.62, wpb=430122, bsz=16313.6, num_updates=20100, lr=0.0004461, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.6, wall=29654 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1484 / 1707 loss=4.198, nll_loss=2.58, ppl=5.98, wps=293936, ups=0.68, wpb=430382, bsz=16488, num_updates=20200, lr=0.000444994, gnorm=0.249, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=29801 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1584 / 1707 loss=4.194, nll_loss=2.576, ppl=5.96, wps=296073, ups=0.69, wpb=429132, bsz=16396.6, num_updates=20300, lr=0.000443897, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=29945 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 epoch 012: 1684 / 1707 loss=4.195, nll_loss=2.577, ppl=5.97, wps=296461, ups=0.69, wpb=430366, bsz=16387, num_updates=20400, lr=0.000442807, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30091 end of epoch 12 (average epoch stats below) epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 epoch 012 | loss 4.187 | nll_loss 2.567 | ppl 5.92 | wps 292042 | ups 0.68 | wpb 428936 | bsz 16334.1 | num_updates 20423 | lr 0.000442558 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 2458 | gb_free 59.5 | wall 30123 Start iterating over samples epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 78 / 1707 loss=4.165, nll_loss=2.542, ppl=5.82, wps=293597, ups=0.69, wpb=427274, bsz=16425.7, num_updates=20500, lr=0.000441726, gnorm=0.253, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=30236 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 178 / 1707 loss=4.156, nll_loss=2.531, ppl=5.78, wps=296249, ups=0.69, wpb=429815, bsz=16331.9, num_updates=20600, lr=0.000440653, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=30381 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 279 / 1707 loss=4.161, nll_loss=2.536, ppl=5.8, wps=292987, ups=0.68, wpb=429000, bsz=16276.4, num_updates=20700, lr=0.000439587, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=30528 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 379 / 1707 loss=4.16, nll_loss=2.535, ppl=5.8, wps=295429, ups=0.69, wpb=428192, bsz=16422.3, num_updates=20800, lr=0.000438529, gnorm=0.245, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=30673 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 479 / 1707 loss=4.158, nll_loss=2.534, ppl=5.79, wps=296210, ups=0.69, wpb=428083, bsz=16279.2, num_updates=20900, lr=0.000437479, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=60.2, wall=30817 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 epoch 013: 579 / 1707 loss=4.17, nll_loss=2.547, ppl=5.84, wps=296981, ups=0.69, wpb=429243, bsz=16263.9, num_updates=21000, lr=0.000436436, gnorm=0.256, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=30962 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013 | valid on 'valid' subset | loss 4.338 | nll_loss 2.721 | ppl 6.59 | wps 76524.8 | wpb 21331 | bsz 1016 | num_updates 21000 | best_loss 4.332 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 679 / 1707 loss=4.168, nll_loss=2.546, ppl=5.84, wps=272184, ups=0.64, wpb=427817, bsz=16320.9, num_updates=21100, lr=0.0004354, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=31119 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 779 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=297293, ups=0.69, wpb=427957, bsz=16389.4, num_updates=21200, lr=0.000434372, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.4, wall=31263 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 879 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=294887, ups=0.69, wpb=428301, bsz=16181.6, num_updates=21300, lr=0.000433351, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=31408 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 980 / 1707 loss=4.178, nll_loss=2.557, ppl=5.88, wps=293734, ups=0.68, wpb=429808, bsz=16478.2, num_updates=21400, lr=0.000432338, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=31554 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1080 / 1707 loss=4.173, nll_loss=2.551, ppl=5.86, wps=296120, ups=0.69, wpb=429716, bsz=16180.4, num_updates=21500, lr=0.000431331, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=31700 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1180 / 1707 loss=4.176, nll_loss=2.555, ppl=5.88, wps=296345, ups=0.69, wpb=430557, bsz=16617.7, num_updates=21600, lr=0.000430331, gnorm=0.243, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=31845 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1281 / 1707 loss=4.171, nll_loss=2.549, ppl=5.85, wps=292615, ups=0.68, wpb=429431, bsz=16361.7, num_updates=21700, lr=0.000429339, gnorm=0.243, clip=0, loss_scale=2, train_wall=146, gb_free=58.9, wall=31992 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1382 / 1707 loss=4.181, nll_loss=2.561, ppl=5.9, wps=294378, ups=0.68, wpb=431229, bsz=16307.7, num_updates=21800, lr=0.000428353, gnorm=0.251, clip=0, loss_scale=1, train_wall=146, gb_free=59.9, wall=32138 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1482 / 1707 loss=4.167, nll_loss=2.545, ppl=5.84, wps=295428, ups=0.69, wpb=427673, bsz=16008.6, num_updates=21900, lr=0.000427374, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=32283 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 epoch 013: 1582 / 1707 loss=4.174, nll_loss=2.554, ppl=5.87, wps=296899, ups=0.69, wpb=430925, bsz=16541.4, num_updates=22000, lr=0.000426401, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=32428 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013 | valid on 'valid' subset | loss 4.321 | nll_loss 2.703 | ppl 6.51 | wps 76537.3 | wpb 21331 | bsz 1016 | num_updates 22000 | best_loss 4.321 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 epoch 013: 1682 / 1707 loss=4.172, nll_loss=2.551, ppl=5.86, wps=262167, ups=0.61, wpb=427990, bsz=16330.2, num_updates=22100, lr=0.000425436, gnorm=0.257, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=32591 end of epoch 13 (average epoch stats below) epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 epoch 013 | loss 4.169 | nll_loss 2.547 | ppl 5.84 | wps 291587 | ups 0.68 | wpb 428936 | bsz 16334.9 | num_updates 22125 | lr 0.000425195 | gnorm 0.246 | clip 0 | loss_scale 2 | train_wall 2459 | gb_free 59.8 | wall 32627 Start iterating over samples epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 75 / 1707 loss=4.141, nll_loss=2.515, ppl=5.71, wps=293680, ups=0.69, wpb=424760, bsz=16273.3, num_updates=22200, lr=0.000424476, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=32736 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 175 / 1707 loss=4.133, nll_loss=2.506, ppl=5.68, wps=296356, ups=0.69, wpb=429626, bsz=16325.8, num_updates=22300, lr=0.000423524, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=32881 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 275 / 1707 loss=4.138, nll_loss=2.511, ppl=5.7, wps=295234, ups=0.69, wpb=428967, bsz=16210.3, num_updates=22400, lr=0.000422577, gnorm=0.256, clip=0, loss_scale=4, train_wall=145, gb_free=59.5, wall=33026 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 375 / 1707 loss=4.15, nll_loss=2.525, ppl=5.76, wps=297014, ups=0.69, wpb=430868, bsz=16427.4, num_updates=22500, lr=0.000421637, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=33171 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 477 / 1707 loss=4.155, nll_loss=2.531, ppl=5.78, wps=289652, ups=0.67, wpb=429115, bsz=16240.7, num_updates=22600, lr=0.000420703, gnorm=0.245, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=33319 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 577 / 1707 loss=4.145, nll_loss=2.519, ppl=5.73, wps=294542, ups=0.69, wpb=427463, bsz=16151, num_updates=22700, lr=0.000419775, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=33464 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 677 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=296369, ups=0.69, wpb=428761, bsz=16312.1, num_updates=22800, lr=0.000418854, gnorm=0.259, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=33609 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 777 / 1707 loss=4.152, nll_loss=2.528, ppl=5.77, wps=294791, ups=0.69, wpb=430163, bsz=16420.7, num_updates=22900, lr=0.000417938, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33755 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 epoch 014: 877 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=294489, ups=0.69, wpb=428321, bsz=16513.3, num_updates=23000, lr=0.000417029, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=33901 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 76411.1 | wpb 21331 | bsz 1016 | num_updates 23000 | best_loss 4.317 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 977 / 1707 loss=4.153, nll_loss=2.529, ppl=5.77, wps=262929, ups=0.61, wpb=429113, bsz=16498.9, num_updates=23100, lr=0.000416125, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=34064 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1077 / 1707 loss=4.16, nll_loss=2.538, ppl=5.81, wps=295928, ups=0.69, wpb=429021, bsz=16297.6, num_updates=23200, lr=0.000415227, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=59.5, wall=34209 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1178 / 1707 loss=4.151, nll_loss=2.527, ppl=5.76, wps=293477, ups=0.68, wpb=430466, bsz=16602.7, num_updates=23300, lr=0.000414335, gnorm=0.248, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=34355 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1279 / 1707 loss=4.155, nll_loss=2.532, ppl=5.78, wps=292349, ups=0.68, wpb=427223, bsz=16242.8, num_updates=23400, lr=0.000413449, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=34502 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1379 / 1707 loss=4.162, nll_loss=2.54, ppl=5.81, wps=296569, ups=0.69, wpb=429799, bsz=16368.6, num_updates=23500, lr=0.000412568, gnorm=0.248, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=34646 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1479 / 1707 loss=4.163, nll_loss=2.541, ppl=5.82, wps=295840, ups=0.69, wpb=429297, bsz=16354.7, num_updates=23600, lr=0.000411693, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=34792 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1579 / 1707 loss=4.165, nll_loss=2.543, ppl=5.83, wps=295903, ups=0.69, wpb=428965, bsz=16212.2, num_updates=23700, lr=0.000410824, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=34937 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 epoch 014: 1679 / 1707 loss=4.166, nll_loss=2.544, ppl=5.83, wps=295168, ups=0.69, wpb=429436, bsz=16288, num_updates=23800, lr=0.00040996, gnorm=0.246, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=35082 end of epoch 14 (average epoch stats below) epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 epoch 014 | loss 4.153 | nll_loss 2.529 | ppl 5.77 | wps 292804 | ups 0.68 | wpb 428939 | bsz 16331.8 | num_updates 23828 | lr 0.000409719 | gnorm 0.241 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.1 | wall 35121 Start iterating over samples epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 72 / 1707 loss=4.131, nll_loss=2.504, ppl=5.67, wps=294983, ups=0.69, wpb=426143, bsz=16365.5, num_updates=23900, lr=0.000409101, gnorm=0.252, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=35226 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 epoch 015: 172 / 1707 loss=4.124, nll_loss=2.495, ppl=5.64, wps=295827, ups=0.69, wpb=429299, bsz=16333.8, num_updates=24000, lr=0.000408248, gnorm=0.24, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=35372 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015 | valid on 'valid' subset | loss 4.331 | nll_loss 2.708 | ppl 6.53 | wps 76087.8 | wpb 21331 | bsz 1016 | num_updates 24000 | best_loss 4.317 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 273 / 1707 loss=4.125, nll_loss=2.497, ppl=5.64, wps=267572, ups=0.62, wpb=429317, bsz=16111.3, num_updates=24100, lr=0.0004074, gnorm=0.245, clip=0, loss_scale=4, train_wall=146, gb_free=59.1, wall=35532 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 373 / 1707 loss=4.132, nll_loss=2.505, ppl=5.68, wps=295634, ups=0.69, wpb=429030, bsz=16337.4, num_updates=24200, lr=0.000406558, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=35677 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 474 / 1707 loss=4.13, nll_loss=2.503, ppl=5.67, wps=293488, ups=0.68, wpb=429186, bsz=16272.9, num_updates=24300, lr=0.00040572, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=35823 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 574 / 1707 loss=4.137, nll_loss=2.511, ppl=5.7, wps=295476, ups=0.69, wpb=428640, bsz=16197, num_updates=24400, lr=0.000404888, gnorm=0.249, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=35968 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 674 / 1707 loss=4.136, nll_loss=2.51, ppl=5.7, wps=295411, ups=0.69, wpb=427989, bsz=16293, num_updates=24500, lr=0.000404061, gnorm=0.252, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=36113 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 774 / 1707 loss=4.142, nll_loss=2.517, ppl=5.72, wps=295983, ups=0.69, wpb=429960, bsz=16650.2, num_updates=24600, lr=0.000403239, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36259 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 874 / 1707 loss=4.14, nll_loss=2.515, ppl=5.72, wps=296023, ups=0.69, wpb=429818, bsz=16315, num_updates=24700, lr=0.000402422, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=36404 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 974 / 1707 loss=4.143, nll_loss=2.518, ppl=5.73, wps=293932, ups=0.69, wpb=427582, bsz=16391.3, num_updates=24800, lr=0.00040161, gnorm=0.238, clip=0, loss_scale=8, train_wall=145, gb_free=59.2, wall=36549 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1076 / 1707 loss=4.14, nll_loss=2.515, ppl=5.71, wps=290708, ups=0.68, wpb=428850, bsz=16271.5, num_updates=24900, lr=0.000400802, gnorm=0.239, clip=0, loss_scale=2, train_wall=147, gb_free=59.2, wall=36697 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 epoch 015: 1177 / 1707 loss=4.147, nll_loss=2.523, ppl=5.75, wps=294147, ups=0.68, wpb=431098, bsz=16192.7, num_updates=25000, lr=0.0004, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=36843 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015 | valid on 'valid' subset | loss 4.315 | nll_loss 2.693 | ppl 6.47 | wps 76577.9 | wpb 21331 | bsz 1016 | num_updates 25000 | best_loss 4.315 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1277 / 1707 loss=4.145, nll_loss=2.521, ppl=5.74, wps=261490, ups=0.61, wpb=428974, bsz=16341.4, num_updates=25100, lr=0.000399202, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=37007 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1377 / 1707 loss=4.146, nll_loss=2.522, ppl=5.74, wps=296471, ups=0.69, wpb=430208, bsz=16546.1, num_updates=25200, lr=0.00039841, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=37153 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1477 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295109, ups=0.69, wpb=428136, bsz=16424.2, num_updates=25300, lr=0.000397621, gnorm=0.253, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=37298 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1577 / 1707 loss=4.143, nll_loss=2.519, ppl=5.73, wps=295960, ups=0.69, wpb=428680, bsz=16348.6, num_updates=25400, lr=0.000396838, gnorm=0.251, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=37442 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 epoch 015: 1677 / 1707 loss=4.147, nll_loss=2.524, ppl=5.75, wps=295349, ups=0.69, wpb=429459, bsz=16163.4, num_updates=25500, lr=0.000396059, gnorm=0.236, clip=0, loss_scale=4, train_wall=145, gb_free=59, wall=37588 end of epoch 15 (average epoch stats below) epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 epoch 015 | loss 4.138 | nll_loss 2.513 | ppl 5.71 | wps 290959 | ups 0.68 | wpb 428934 | bsz 16332.1 | num_updates 25530 | lr 0.000395826 | gnorm 0.242 | clip 0 | loss_scale 4 | train_wall 2463 | gb_free 59.9 | wall 37630 Start iterating over samples epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 70 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=293569, ups=0.69, wpb=425175, bsz=16484.6, num_updates=25600, lr=0.000395285, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=37733 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 170 / 1707 loss=4.113, nll_loss=2.484, ppl=5.59, wps=294933, ups=0.69, wpb=429043, bsz=16411.4, num_updates=25700, lr=0.000394515, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=58.8, wall=37878 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 271 / 1707 loss=4.112, nll_loss=2.483, ppl=5.59, wps=292732, ups=0.68, wpb=428566, bsz=16219.1, num_updates=25800, lr=0.00039375, gnorm=0.234, clip=0, loss_scale=4, train_wall=146, gb_free=59.2, wall=38025 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 372 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294694, ups=0.68, wpb=430811, bsz=16291, num_updates=25900, lr=0.000392989, gnorm=0.251, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=38171 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 epoch 016: 472 / 1707 loss=4.119, nll_loss=2.491, ppl=5.62, wps=294778, ups=0.69, wpb=428935, bsz=16192.9, num_updates=26000, lr=0.000392232, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=38316 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016 | valid on 'valid' subset | loss 4.314 | nll_loss 2.694 | ppl 6.47 | wps 75841.2 | wpb 21331 | bsz 1016 | num_updates 26000 | best_loss 4.314 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 572 / 1707 loss=4.121, nll_loss=2.493, ppl=5.63, wps=258328, ups=0.6, wpb=427552, bsz=16208.3, num_updates=26100, lr=0.00039148, gnorm=0.237, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=38482 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 672 / 1707 loss=4.126, nll_loss=2.499, ppl=5.65, wps=296429, ups=0.69, wpb=430030, bsz=16301.6, num_updates=26200, lr=0.000390732, gnorm=0.246, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=38627 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 772 / 1707 loss=4.12, nll_loss=2.492, ppl=5.63, wps=294612, ups=0.69, wpb=428507, bsz=16227.5, num_updates=26300, lr=0.000389989, gnorm=0.232, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=38772 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 873 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=292476, ups=0.68, wpb=429114, bsz=16415, num_updates=26400, lr=0.000389249, gnorm=0.258, clip=0, loss_scale=4, train_wall=146, gb_free=59.6, wall=38919 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 974 / 1707 loss=4.122, nll_loss=2.495, ppl=5.64, wps=292491, ups=0.68, wpb=428844, bsz=16289.2, num_updates=26500, lr=0.000388514, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=39066 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1074 / 1707 loss=4.126, nll_loss=2.5, ppl=5.66, wps=295292, ups=0.69, wpb=428805, bsz=16390.1, num_updates=26600, lr=0.000387783, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=39211 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1174 / 1707 loss=4.138, nll_loss=2.514, ppl=5.71, wps=294707, ups=0.69, wpb=429819, bsz=16625.4, num_updates=26700, lr=0.000387056, gnorm=0.234, clip=0, loss_scale=4, train_wall=145, gb_free=59.2, wall=39357 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1275 / 1707 loss=4.131, nll_loss=2.506, ppl=5.68, wps=291507, ups=0.68, wpb=428516, bsz=16452.2, num_updates=26800, lr=0.000386334, gnorm=0.235, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=39504 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1375 / 1707 loss=4.13, nll_loss=2.505, ppl=5.67, wps=296030, ups=0.69, wpb=429944, bsz=15979.2, num_updates=26900, lr=0.000385615, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=39649 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 epoch 016: 1475 / 1707 loss=4.133, nll_loss=2.508, ppl=5.69, wps=295395, ups=0.69, wpb=429353, bsz=16721.7, num_updates=27000, lr=0.0003849, gnorm=0.229, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=39794 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016 | valid on 'valid' subset | loss 4.305 | nll_loss 2.687 | ppl 6.44 | wps 75715 | wpb 21331 | bsz 1016 | num_updates 27000 | best_loss 4.305 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1575 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=263685, ups=0.61, wpb=429262, bsz=16271.5, num_updates=27100, lr=0.000384189, gnorm=0.24, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=39957 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 epoch 016: 1675 / 1707 loss=4.134, nll_loss=2.509, ppl=5.69, wps=296558, ups=0.69, wpb=429321, bsz=16254.2, num_updates=27200, lr=0.000383482, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=40102 end of epoch 16 (average epoch stats below) epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 epoch 016 | loss 4.125 | nll_loss 2.498 | ppl 5.65 | wps 290106 | ups 0.68 | wpb 428949 | bsz 16334.2 | num_updates 27232 | lr 0.000383257 | gnorm 0.237 | clip 0 | loss_scale 8 | train_wall 2463 | gb_free 59.9 | wall 40147 Start iterating over samples epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 69 / 1707 loss=4.107, nll_loss=2.477, ppl=5.57, wps=292545, ups=0.69, wpb=426136, bsz=16261.4, num_updates=27300, lr=0.00038278, gnorm=0.23, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40248 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 170 / 1707 loss=4.105, nll_loss=2.475, ppl=5.56, wps=291649, ups=0.68, wpb=429023, bsz=16332.2, num_updates=27400, lr=0.00038208, gnorm=0.241, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=40395 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 270 / 1707 loss=4.097, nll_loss=2.466, ppl=5.53, wps=294540, ups=0.69, wpb=427804, bsz=16385.7, num_updates=27500, lr=0.000381385, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=40540 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 370 / 1707 loss=4.107, nll_loss=2.478, ppl=5.57, wps=295999, ups=0.69, wpb=431153, bsz=16413.7, num_updates=27600, lr=0.000380693, gnorm=0.24, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=40686 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 470 / 1707 loss=4.108, nll_loss=2.478, ppl=5.57, wps=296050, ups=0.69, wpb=429384, bsz=16349.5, num_updates=27700, lr=0.000380006, gnorm=0.241, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=40831 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 571 / 1707 loss=4.103, nll_loss=2.474, ppl=5.55, wps=291079, ups=0.68, wpb=428672, bsz=16279.3, num_updates=27800, lr=0.000379322, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=40978 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 671 / 1707 loss=4.123, nll_loss=2.495, ppl=5.64, wps=295244, ups=0.69, wpb=428544, bsz=16268.4, num_updates=27900, lr=0.000378641, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41123 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 epoch 017: 771 / 1707 loss=4.12, nll_loss=2.493, ppl=5.63, wps=294924, ups=0.69, wpb=427801, bsz=16110.8, num_updates=28000, lr=0.000377964, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59.8, wall=41268 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76386.6 | wpb 21331 | bsz 1016 | num_updates 28000 | best_loss 4.296 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 872 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=260958, ups=0.61, wpb=428859, bsz=16370.4, num_updates=28100, lr=0.000377291, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=41432 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 972 / 1707 loss=4.111, nll_loss=2.482, ppl=5.59, wps=295720, ups=0.69, wpb=428879, bsz=16346.2, num_updates=28200, lr=0.000376622, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=41577 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1072 / 1707 loss=4.114, nll_loss=2.486, ppl=5.6, wps=295364, ups=0.69, wpb=428573, bsz=16455.1, num_updates=28300, lr=0.000375956, gnorm=0.241, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=41723 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1172 / 1707 loss=4.123, nll_loss=2.496, ppl=5.64, wps=296142, ups=0.69, wpb=429766, bsz=16184.2, num_updates=28400, lr=0.000375293, gnorm=0.233, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=41868 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1272 / 1707 loss=4.123, nll_loss=2.497, ppl=5.64, wps=296948, ups=0.69, wpb=430539, bsz=16231.9, num_updates=28500, lr=0.000374634, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=42013 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1373 / 1707 loss=4.118, nll_loss=2.491, ppl=5.62, wps=292932, ups=0.68, wpb=429089, bsz=16256.6, num_updates=28600, lr=0.000373979, gnorm=0.244, clip=0, loss_scale=2, train_wall=146, gb_free=59.5, wall=42159 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1473 / 1707 loss=4.12, nll_loss=2.494, ppl=5.63, wps=295717, ups=0.69, wpb=430888, bsz=16423.4, num_updates=28700, lr=0.000373327, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=42305 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1574 / 1707 loss=4.117, nll_loss=2.491, ppl=5.62, wps=291206, ups=0.68, wpb=427823, bsz=16302.9, num_updates=28800, lr=0.000372678, gnorm=0.233, clip=0, loss_scale=1, train_wall=146, gb_free=59, wall=42452 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 epoch 017: 1674 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=295163, ups=0.69, wpb=429459, bsz=16680.9, num_updates=28900, lr=0.000372033, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=42597 end of epoch 17 (average epoch stats below) epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 epoch 017 | loss 4.113 | nll_loss 2.485 | ppl 5.6 | wps 292210 | ups 0.68 | wpb 428943 | bsz 16333.7 | num_updates 28933 | lr 0.00037182 | gnorm 0.238 | clip 0 | loss_scale 1 | train_wall 2465 | gb_free 60.4 | wall 42644 Start iterating over samples epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 epoch 018: 67 / 1707 loss=4.086, nll_loss=2.454, ppl=5.48, wps=293695, ups=0.69, wpb=424863, bsz=16306.2, num_updates=29000, lr=0.000371391, gnorm=0.245, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=42742 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.306 | nll_loss 2.689 | ppl 6.45 | wps 75694.9 | wpb 21331 | bsz 1016 | num_updates 29000 | best_loss 4.296 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 167 / 1707 loss=4.093, nll_loss=2.461, ppl=5.51, wps=270962, ups=0.63, wpb=429196, bsz=16443.2, num_updates=29100, lr=0.000370752, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=42900 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 267 / 1707 loss=4.09, nll_loss=2.458, ppl=5.49, wps=296452, ups=0.69, wpb=430443, bsz=16379.9, num_updates=29200, lr=0.000370117, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=43046 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 368 / 1707 loss=4.085, nll_loss=2.453, ppl=5.48, wps=292666, ups=0.68, wpb=429222, bsz=16158.1, num_updates=29300, lr=0.000369484, gnorm=0.253, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=43192 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 468 / 1707 loss=4.1, nll_loss=2.469, ppl=5.54, wps=297266, ups=0.69, wpb=430727, bsz=16171.7, num_updates=29400, lr=0.000368856, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43337 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 568 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295007, ups=0.69, wpb=427794, bsz=16419.4, num_updates=29500, lr=0.00036823, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=43482 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 668 / 1707 loss=4.101, nll_loss=2.471, ppl=5.54, wps=295994, ups=0.69, wpb=429467, bsz=16255.8, num_updates=29600, lr=0.000367607, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=43627 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 768 / 1707 loss=4.104, nll_loss=2.475, ppl=5.56, wps=296556, ups=0.69, wpb=430257, bsz=16455.8, num_updates=29700, lr=0.000366988, gnorm=0.244, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=43772 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 869 / 1707 loss=4.102, nll_loss=2.472, ppl=5.55, wps=291902, ups=0.68, wpb=428211, bsz=16457.6, num_updates=29800, lr=0.000366372, gnorm=0.228, clip=0, loss_scale=2, train_wall=146, gb_free=59.1, wall=43919 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 969 / 1707 loss=4.103, nll_loss=2.474, ppl=5.56, wps=295436, ups=0.69, wpb=427945, bsz=16201.1, num_updates=29900, lr=0.000365758, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=44064 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 epoch 018: 1069 / 1707 loss=4.105, nll_loss=2.477, ppl=5.57, wps=295276, ups=0.69, wpb=429634, bsz=16404.1, num_updates=30000, lr=0.000365148, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=44209 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018 | valid on 'valid' subset | loss 4.302 | nll_loss 2.681 | ppl 6.41 | wps 75844.7 | wpb 21331 | bsz 1016 | num_updates 30000 | best_loss 4.296 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1169 / 1707 loss=4.117, nll_loss=2.49, ppl=5.62, wps=270623, ups=0.63, wpb=429921, bsz=16231.8, num_updates=30100, lr=0.000364541, gnorm=0.225, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=44368 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1270 / 1707 loss=4.108, nll_loss=2.48, ppl=5.58, wps=291837, ups=0.68, wpb=428539, bsz=16522.6, num_updates=30200, lr=0.000363937, gnorm=0.252, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=44515 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1370 / 1707 loss=4.106, nll_loss=2.478, ppl=5.57, wps=295390, ups=0.69, wpb=429597, bsz=16357.4, num_updates=30300, lr=0.000363336, gnorm=0.244, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=44660 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1470 / 1707 loss=4.113, nll_loss=2.485, ppl=5.6, wps=295230, ups=0.69, wpb=427890, bsz=16244.7, num_updates=30400, lr=0.000362738, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=44805 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1571 / 1707 loss=4.101, nll_loss=2.473, ppl=5.55, wps=292071, ups=0.68, wpb=429321, bsz=16209.9, num_updates=30500, lr=0.000362143, gnorm=0.239, clip=0, loss_scale=2, train_wall=146, gb_free=59.2, wall=44952 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 epoch 018: 1672 / 1707 loss=4.116, nll_loss=2.489, ppl=5.61, wps=292304, ups=0.68, wpb=429243, bsz=16403.6, num_updates=30600, lr=0.000361551, gnorm=0.242, clip=0, loss_scale=1, train_wall=146, gb_free=59.5, wall=45099 end of epoch 18 (average epoch stats below) epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 epoch 018 | loss 4.102 | nll_loss 2.472 | ppl 5.55 | wps 291455 | ups 0.68 | wpb 428962 | bsz 16333.6 | num_updates 30635 | lr 0.000361344 | gnorm 0.239 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 45149 Start iterating over samples epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 65 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=294080, ups=0.69, wpb=425529, bsz=16218.4, num_updates=30700, lr=0.000360961, gnorm=0.237, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=45244 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 165 / 1707 loss=4.074, nll_loss=2.441, ppl=5.43, wps=293936, ups=0.69, wpb=428601, bsz=16522, num_updates=30800, lr=0.000360375, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=45390 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 265 / 1707 loss=4.078, nll_loss=2.445, ppl=5.45, wps=294835, ups=0.69, wpb=427848, bsz=16410.6, num_updates=30900, lr=0.000359791, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=45535 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 epoch 019: 365 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=295663, ups=0.69, wpb=429723, bsz=16355.7, num_updates=31000, lr=0.000359211, gnorm=0.25, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=45680 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.314 | nll_loss 2.693 | ppl 6.47 | wps 75475 | wpb 21331 | bsz 1016 | num_updates 31000 | best_loss 4.296 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 465 / 1707 loss=4.084, nll_loss=2.452, ppl=5.47, wps=271886, ups=0.63, wpb=429279, bsz=16521.8, num_updates=31100, lr=0.000358633, gnorm=0.238, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=45838 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 566 / 1707 loss=4.093, nll_loss=2.462, ppl=5.51, wps=293934, ups=0.68, wpb=429800, bsz=16356.7, num_updates=31200, lr=0.000358057, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=45984 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 666 / 1707 loss=4.091, nll_loss=2.46, ppl=5.5, wps=295076, ups=0.69, wpb=429256, bsz=16341.1, num_updates=31300, lr=0.000357485, gnorm=0.242, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=46130 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 766 / 1707 loss=4.097, nll_loss=2.467, ppl=5.53, wps=295509, ups=0.69, wpb=428642, bsz=16166.7, num_updates=31400, lr=0.000356915, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=46275 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 866 / 1707 loss=4.101, nll_loss=2.472, ppl=5.55, wps=295238, ups=0.69, wpb=429926, bsz=16662.3, num_updates=31500, lr=0.000356348, gnorm=0.231, clip=0, loss_scale=4, train_wall=145, gb_free=59.3, wall=46421 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 966 / 1707 loss=4.091, nll_loss=2.461, ppl=5.5, wps=294987, ups=0.69, wpb=428494, bsz=16279.6, num_updates=31600, lr=0.000355784, gnorm=0.246, clip=0, loss_scale=4, train_wall=145, gb_free=59.7, wall=46566 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1068 / 1707 loss=4.09, nll_loss=2.459, ppl=5.5, wps=289904, ups=0.68, wpb=429410, bsz=16397.7, num_updates=31700, lr=0.000355222, gnorm=0.23, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=46714 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1168 / 1707 loss=4.095, nll_loss=2.465, ppl=5.52, wps=294745, ups=0.69, wpb=428307, bsz=16348.9, num_updates=31800, lr=0.000354663, gnorm=0.24, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=46859 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1268 / 1707 loss=4.097, nll_loss=2.468, ppl=5.53, wps=295956, ups=0.69, wpb=430194, bsz=16454.7, num_updates=31900, lr=0.000354107, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=47005 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 epoch 019: 1368 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=296990, ups=0.69, wpb=431192, bsz=16226.2, num_updates=32000, lr=0.000353553, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=47150 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019 | valid on 'valid' subset | loss 4.305 | nll_loss 2.683 | ppl 6.42 | wps 76024.2 | wpb 21331 | bsz 1016 | num_updates 32000 | best_loss 4.296 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1468 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=271113, ups=0.63, wpb=428005, bsz=16237.7, num_updates=32100, lr=0.000353002, gnorm=0.227, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=47308 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1569 / 1707 loss=4.1, nll_loss=2.471, ppl=5.54, wps=292781, ups=0.68, wpb=428928, bsz=16165.8, num_updates=32200, lr=0.000352454, gnorm=0.233, clip=0, loss_scale=2, train_wall=146, gb_free=59.6, wall=47454 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 epoch 019: 1670 / 1707 loss=4.094, nll_loss=2.465, ppl=5.52, wps=292666, ups=0.68, wpb=428004, bsz=16234.3, num_updates=32300, lr=0.000351908, gnorm=0.248, clip=0, loss_scale=1, train_wall=146, gb_free=58.9, wall=47600 end of epoch 19 (average epoch stats below) epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 epoch 019 | loss 4.091 | nll_loss 2.461 | ppl 5.5 | wps 291582 | ups 0.68 | wpb 428942 | bsz 16334.4 | num_updates 32337 | lr 0.000351706 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 60.2 | wall 47653 Start iterating over samples epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 63 / 1707 loss=4.081, nll_loss=2.449, ppl=5.46, wps=295277, ups=0.69, wpb=426706, bsz=16265.9, num_updates=32400, lr=0.000351364, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=47745 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 163 / 1707 loss=4.065, nll_loss=2.43, ppl=5.39, wps=296432, ups=0.69, wpb=430193, bsz=16156.3, num_updates=32500, lr=0.000350823, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=47890 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 263 / 1707 loss=4.071, nll_loss=2.437, ppl=5.42, wps=295129, ups=0.69, wpb=429295, bsz=16117.5, num_updates=32600, lr=0.000350285, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=48035 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 363 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=294889, ups=0.69, wpb=429281, bsz=16478.1, num_updates=32700, lr=0.000349749, gnorm=0.233, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=48181 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 463 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=295470, ups=0.69, wpb=429839, bsz=16203.9, num_updates=32800, lr=0.000349215, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=48327 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 564 / 1707 loss=4.083, nll_loss=2.452, ppl=5.47, wps=292961, ups=0.68, wpb=429095, bsz=16241.8, num_updates=32900, lr=0.000348684, gnorm=0.24, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=48473 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 epoch 020: 664 / 1707 loss=4.078, nll_loss=2.446, ppl=5.45, wps=293350, ups=0.68, wpb=428606, bsz=16687.8, num_updates=33000, lr=0.000348155, gnorm=0.263, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=48619 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020 | valid on 'valid' subset | loss 4.306 | nll_loss 2.683 | ppl 6.42 | wps 75619.4 | wpb 21331 | bsz 1016 | num_updates 33000 | best_loss 4.296 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 764 / 1707 loss=4.077, nll_loss=2.444, ppl=5.44, wps=272149, ups=0.63, wpb=428741, bsz=16209.6, num_updates=33100, lr=0.000347629, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=48777 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 865 / 1707 loss=4.089, nll_loss=2.459, ppl=5.5, wps=293742, ups=0.68, wpb=430187, bsz=16375.4, num_updates=33200, lr=0.000347105, gnorm=0.221, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=48923 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 966 / 1707 loss=4.088, nll_loss=2.457, ppl=5.49, wps=293181, ups=0.68, wpb=429456, bsz=16168.2, num_updates=33300, lr=0.000346583, gnorm=0.249, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=49070 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1066 / 1707 loss=4.087, nll_loss=2.457, ppl=5.49, wps=296260, ups=0.69, wpb=429107, bsz=16543.4, num_updates=33400, lr=0.000346064, gnorm=0.232, clip=0, loss_scale=1, train_wall=144, gb_free=59.6, wall=49214 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1166 / 1707 loss=4.088, nll_loss=2.458, ppl=5.49, wps=295521, ups=0.69, wpb=428486, bsz=16245.5, num_updates=33500, lr=0.000345547, gnorm=0.246, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=49359 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1266 / 1707 loss=4.098, nll_loss=2.469, ppl=5.54, wps=295494, ups=0.69, wpb=429220, bsz=16457.5, num_updates=33600, lr=0.000345033, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=49505 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1366 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=294324, ups=0.69, wpb=428992, bsz=16508.1, num_updates=33700, lr=0.00034452, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=49650 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1467 / 1707 loss=4.094, nll_loss=2.464, ppl=5.52, wps=292570, ups=0.68, wpb=428398, bsz=16298.7, num_updates=33800, lr=0.00034401, gnorm=0.227, clip=0, loss_scale=2, train_wall=146, gb_free=59.4, wall=49797 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1567 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=294891, ups=0.69, wpb=428920, bsz=16466.6, num_updates=33900, lr=0.000343503, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=49942 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 epoch 020: 1667 / 1707 loss=4.084, nll_loss=2.453, ppl=5.48, wps=293946, ups=0.69, wpb=428362, bsz=16088.3, num_updates=34000, lr=0.000342997, gnorm=0.247, clip=0, loss_scale=4, train_wall=145, gb_free=59.1, wall=50088 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 epoch 020 | valid on 'valid' subset | loss 4.291 | nll_loss 2.668 | ppl 6.36 | wps 75393 | wpb 21331 | bsz 1016 | num_updates 34000 | best_loss 4.291 end of epoch 20 (average epoch stats below) epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 epoch 020 | loss 4.081 | nll_loss 2.45 | ppl 5.46 | wps 290649 | ups 0.68 | wpb 428939 | bsz 16331.9 | num_updates 34039 | lr 0.000342801 | gnorm 0.236 | clip 0 | loss_scale 2 | train_wall 2465 | gb_free 60.5 | wall 50165 Start iterating over samples epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 61 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=257689, ups=0.61, wpb=425524, bsz=16246, num_updates=34100, lr=0.000342494, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=50253 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 161 / 1707 loss=4.054, nll_loss=2.418, ppl=5.35, wps=296937, ups=0.69, wpb=430813, bsz=16166.7, num_updates=34200, lr=0.000341993, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=50398 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 262 / 1707 loss=4.055, nll_loss=2.42, ppl=5.35, wps=293584, ups=0.68, wpb=429485, bsz=16299.8, num_updates=34300, lr=0.000341494, gnorm=0.236, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=50545 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 362 / 1707 loss=4.065, nll_loss=2.431, ppl=5.39, wps=295687, ups=0.69, wpb=429173, bsz=16328, num_updates=34400, lr=0.000340997, gnorm=0.236, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=50690 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 463 / 1707 loss=4.069, nll_loss=2.436, ppl=5.41, wps=292412, ups=0.68, wpb=427812, bsz=16195.3, num_updates=34500, lr=0.000340503, gnorm=0.235, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=50836 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 563 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295323, ups=0.69, wpb=428932, bsz=16177.4, num_updates=34600, lr=0.00034001, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=50981 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 663 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295101, ups=0.69, wpb=428663, bsz=16618, num_updates=34700, lr=0.00033952, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=51127 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 763 / 1707 loss=4.073, nll_loss=2.44, ppl=5.43, wps=294836, ups=0.69, wpb=429179, bsz=16410.2, num_updates=34800, lr=0.000339032, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=51272 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 863 / 1707 loss=4.082, nll_loss=2.451, ppl=5.47, wps=295190, ups=0.69, wpb=429073, bsz=16396.3, num_updates=34900, lr=0.000338546, gnorm=0.239, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=51417 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 epoch 021: 964 / 1707 loss=4.077, nll_loss=2.446, ppl=5.45, wps=292405, ups=0.68, wpb=430455, bsz=16314.4, num_updates=35000, lr=0.000338062, gnorm=0.245, clip=0, loss_scale=2, train_wall=146, gb_free=59.3, wall=51565 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021 | valid on 'valid' subset | loss 4.31 | nll_loss 2.69 | ppl 6.45 | wps 75788.3 | wpb 21331 | bsz 1016 | num_updates 35000 | best_loss 4.291 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1065 / 1707 loss=4.075, nll_loss=2.443, ppl=5.44, wps=266020, ups=0.62, wpb=428792, bsz=16493.2, num_updates=35100, lr=0.00033758, gnorm=0.241, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=51726 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1165 / 1707 loss=4.071, nll_loss=2.439, ppl=5.42, wps=294740, ups=0.69, wpb=427732, bsz=16269.2, num_updates=35200, lr=0.0003371, gnorm=0.229, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=51871 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1265 / 1707 loss=4.085, nll_loss=2.455, ppl=5.48, wps=294797, ups=0.69, wpb=430013, bsz=16388.6, num_updates=35300, lr=0.000336622, gnorm=0.24, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=52017 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1365 / 1707 loss=4.09, nll_loss=2.46, ppl=5.5, wps=296298, ups=0.69, wpb=429382, bsz=16235.9, num_updates=35400, lr=0.000336146, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=52162 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1465 / 1707 loss=4.08, nll_loss=2.449, ppl=5.46, wps=295368, ups=0.69, wpb=430095, bsz=16509, num_updates=35500, lr=0.000335673, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=52307 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1565 / 1707 loss=4.081, nll_loss=2.45, ppl=5.46, wps=295294, ups=0.69, wpb=428847, bsz=16308.2, num_updates=35600, lr=0.000335201, gnorm=0.231, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=52453 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 epoch 021: 1667 / 1707 loss=4.076, nll_loss=2.444, ppl=5.44, wps=288663, ups=0.67, wpb=427891, bsz=16274.7, num_updates=35700, lr=0.000334731, gnorm=0.246, clip=0, loss_scale=1, train_wall=147, gb_free=58.9, wall=52601 end of epoch 21 (average epoch stats below) epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 epoch 021 | loss 4.073 | nll_loss 2.44 | ppl 5.43 | wps 292692 | ups 0.68 | wpb 428936 | bsz 16334.6 | num_updates 35740 | lr 0.000334544 | gnorm 0.235 | clip 0 | loss_scale 1 | train_wall 2464 | gb_free 59 | wall 52657 Start iterating over samples epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 60 / 1707 loss=4.064, nll_loss=2.43, ppl=5.39, wps=295937, ups=0.7, wpb=425768, bsz=16276.4, num_updates=35800, lr=0.000334263, gnorm=0.225, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=52745 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 160 / 1707 loss=4.049, nll_loss=2.413, ppl=5.33, wps=296715, ups=0.69, wpb=429744, bsz=16197.8, num_updates=35900, lr=0.000333797, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=52890 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 epoch 022: 260 / 1707 loss=4.047, nll_loss=2.41, ppl=5.32, wps=295144, ups=0.69, wpb=429439, bsz=16390.9, num_updates=36000, lr=0.000333333, gnorm=0.227, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=53035 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022 | valid on 'valid' subset | loss 4.292 | nll_loss 2.67 | ppl 6.36 | wps 75193.5 | wpb 21331 | bsz 1016 | num_updates 36000 | best_loss 4.291 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 360 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=272416, ups=0.64, wpb=428396, bsz=16287.8, num_updates=36100, lr=0.000332871, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=53192 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 460 / 1707 loss=4.058, nll_loss=2.423, ppl=5.36, wps=296784, ups=0.69, wpb=430153, bsz=16184.8, num_updates=36200, lr=0.000332411, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59.4, wall=53337 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 560 / 1707 loss=4.06, nll_loss=2.426, ppl=5.37, wps=296197, ups=0.69, wpb=429026, bsz=16023.4, num_updates=36300, lr=0.000331953, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53482 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 660 / 1707 loss=4.068, nll_loss=2.435, ppl=5.41, wps=295908, ups=0.69, wpb=429563, bsz=16236.5, num_updates=36400, lr=0.000331497, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=53627 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 762 / 1707 loss=4.071, nll_loss=2.438, ppl=5.42, wps=288913, ups=0.68, wpb=427048, bsz=16304.2, num_updates=36500, lr=0.000331042, gnorm=0.232, clip=0, loss_scale=2, train_wall=147, gb_free=58.9, wall=53775 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 862 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295920, ups=0.69, wpb=430314, bsz=16600.6, num_updates=36600, lr=0.00033059, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=53920 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 963 / 1707 loss=4.066, nll_loss=2.433, ppl=5.4, wps=292685, ups=0.68, wpb=428913, bsz=16258.4, num_updates=36700, lr=0.000330139, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=54067 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1063 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=294845, ups=0.69, wpb=428768, bsz=16406.6, num_updates=36800, lr=0.00032969, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=54212 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1163 / 1707 loss=4.066, nll_loss=2.434, ppl=5.4, wps=295570, ups=0.69, wpb=429046, bsz=16482.1, num_updates=36900, lr=0.000329243, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=59.4, wall=54358 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 epoch 022: 1263 / 1707 loss=4.07, nll_loss=2.438, ppl=5.42, wps=295258, ups=0.69, wpb=429258, bsz=16593.2, num_updates=37000, lr=0.000328798, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=54503 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022 | valid on 'valid' subset | loss 4.285 | nll_loss 2.662 | ppl 6.33 | wps 75888.6 | wpb 21331 | bsz 1016 | num_updates 37000 | best_loss 4.285 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1363 / 1707 loss=4.065, nll_loss=2.432, ppl=5.4, wps=262918, ups=0.61, wpb=428213, bsz=16412.4, num_updates=37100, lr=0.000328355, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=54666 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1463 / 1707 loss=4.073, nll_loss=2.441, ppl=5.43, wps=295518, ups=0.69, wpb=429576, bsz=16663.3, num_updates=37200, lr=0.000327913, gnorm=0.23, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=54811 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1563 / 1707 loss=4.073, nll_loss=2.442, ppl=5.43, wps=296927, ups=0.69, wpb=429988, bsz=16160.4, num_updates=37300, lr=0.000327473, gnorm=0.224, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=54956 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 epoch 022: 1664 / 1707 loss=4.078, nll_loss=2.447, ppl=5.45, wps=293427, ups=0.68, wpb=428523, bsz=16074.1, num_updates=37400, lr=0.000327035, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=55102 end of epoch 22 (average epoch stats below) epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 epoch 022 | loss 4.064 | nll_loss 2.431 | ppl 5.39 | wps 291460 | ups 0.68 | wpb 428927 | bsz 16330.5 | num_updates 37443 | lr 0.000326847 | gnorm 0.232 | clip 0 | loss_scale 2 | train_wall 2461 | gb_free 59.5 | wall 55164 Start iterating over samples epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 57 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=293788, ups=0.69, wpb=425388, bsz=16302.1, num_updates=37500, lr=0.000326599, gnorm=0.237, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55247 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 157 / 1707 loss=4.046, nll_loss=2.41, ppl=5.31, wps=296182, ups=0.69, wpb=430142, bsz=16360, num_updates=37600, lr=0.000326164, gnorm=0.219, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=55392 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 257 / 1707 loss=4.043, nll_loss=2.407, ppl=5.3, wps=295540, ups=0.69, wpb=429008, bsz=16683.6, num_updates=37700, lr=0.000325731, gnorm=0.235, clip=0, loss_scale=4, train_wall=144, gb_free=59, wall=55537 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 358 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=292938, ups=0.68, wpb=429712, bsz=16239.7, num_updates=37800, lr=0.0003253, gnorm=0.232, clip=0, loss_scale=2, train_wall=146, gb_free=59, wall=55684 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 458 / 1707 loss=4.054, nll_loss=2.419, ppl=5.35, wps=295900, ups=0.69, wpb=429155, bsz=16301, num_updates=37900, lr=0.000324871, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=55829 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 epoch 023: 559 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=293671, ups=0.68, wpb=431415, bsz=16557.3, num_updates=38000, lr=0.000324443, gnorm=0.245, clip=0, loss_scale=1, train_wall=146, gb_free=59.4, wall=55976 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023 | valid on 'valid' subset | loss 4.299 | nll_loss 2.68 | ppl 6.41 | wps 76343.2 | wpb 21331 | bsz 1016 | num_updates 38000 | best_loss 4.285 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 659 / 1707 loss=4.055, nll_loss=2.421, ppl=5.36, wps=272601, ups=0.64, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.233, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=56133 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 759 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=296623, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=56278 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 859 / 1707 loss=4.063, nll_loss=2.43, ppl=5.39, wps=296227, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=56423 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 epoch 023: 960 / 1707 loss=4.056, nll_loss=2.421, ppl=5.36, wps=293140, ups=0.68, wpb=428663, bsz=16146, num_updates=38400, lr=0.000322749, gnorm=0.227, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=56570 {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/en-ja.E18-D4/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:27460', 'distributed_port': 27460, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 8192, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 8192, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [8], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/en-ja.E18-D4', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/en-ja.E18-D4/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=8192, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=8192, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[8], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/en-ja.E18-D4', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=False, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/en-ja/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, encoder_layers=18, decoder_layers=4, dropout=0.1, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_learned_pos=False, decoder_embed_path=None, decoder_normalize_before=False, decoder_learned_pos=False, attention_dropout=0.0, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/en-ja/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} TransformerModel( (encoder): TransformerEncoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(16000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-17): 18 x TransformerEncoderLayerBase( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (dropout_module): FairseqDropout() (activation_dropout_module): FairseqDropout() (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) ) (decoder): TransformerDecoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(32000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-3): 4 x TransformerDecoderLayerBase( (dropout_module): FairseqDropout() (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (activation_dropout_module): FairseqDropout() (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (encoder_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (encoder_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) (output_projection): Linear(in_features=1024, out_features=32000, bias=False) ) ) task: TranslationTask model: TransformerModel criterion: LabelSmoothedCrossEntropyCriterion num. shared model params: 527,710,208 (num. trained: 527,710,208) num. expert model params: 0 (num. trained: 0) training on 8 devices (GPUs/TPUs) max tokens per device = 8192 and max sentences per device = None begin dry-run validation on "valid" subset Start iterating over samples epoch 023: 659 / 1707 loss=4.038, nll_loss=2.401, ppl=5.28, wps=269720, ups=0.63, wpb=429176, bsz=16263.1, num_updates=38100, lr=0.000324017, gnorm=0.235, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 023: 759 / 1707 loss=4.045, nll_loss=2.409, ppl=5.31, wps=298270, ups=0.69, wpb=429640, bsz=16609.2, num_updates=38200, lr=0.000323592, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 023: 859 / 1707 loss=4.052, nll_loss=2.417, ppl=5.34, wps=298045, ups=0.69, wpb=430318, bsz=16916.2, num_updates=38300, lr=0.00032317, gnorm=0.226, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=0 epoch 023: 960 / 1707 loss=4.044, nll_loss=2.407, ppl=5.31, wps=294240, ups=0.69, wpb=428864, bsz=16147.3, num_updates=38400, lr=0.000322749, gnorm=0.233, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=0 epoch 023: 1060 / 1707 loss=4.061, nll_loss=2.427, ppl=5.38, wps=298073, ups=0.69, wpb=429372, bsz=16443.9, num_updates=38500, lr=0.000322329, gnorm=0.227, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 023: 1160 / 1707 loss=4.048, nll_loss=2.412, ppl=5.32, wps=296278, ups=0.69, wpb=428158, bsz=16199.8, num_updates=38600, lr=0.000321911, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 023: 1260 / 1707 loss=4.048, nll_loss=2.413, ppl=5.33, wps=295729, ups=0.69, wpb=426313, bsz=16227.7, num_updates=38700, lr=0.000321495, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 023: 1360 / 1707 loss=4.053, nll_loss=2.418, ppl=5.34, wps=296809, ups=0.69, wpb=428953, bsz=16078.7, num_updates=38800, lr=0.000321081, gnorm=0.243, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 023: 1461 / 1707 loss=4.053, nll_loss=2.418, ppl=5.35, wps=293297, ups=0.68, wpb=428392, bsz=16326.7, num_updates=38900, lr=0.000320668, gnorm=0.241, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=0 epoch 023: 1561 / 1707 loss=4.059, nll_loss=2.425, ppl=5.37, wps=297407, ups=0.69, wpb=428907, bsz=16002.4, num_updates=39000, lr=0.000320256, gnorm=0.237, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 4.298 | nll_loss 2.681 | ppl 6.41 | wps 76302.9 | wpb 21331 | bsz 1016 | num_updates 39000 | best_loss 4.285 epoch 023: 1661 / 1707 loss=4.059, nll_loss=2.426, ppl=5.37, wps=272562, ups=0.63, wpb=429813, bsz=16117.9, num_updates=39100, lr=0.000319847, gnorm=0.225, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=0 end of epoch 23 (average epoch stats below) epoch 023 | loss 4.05 | nll_loss 2.415 | ppl 5.33 | wps 292679 | ups 0.68 | wpb 428956 | bsz 16334.2 | num_updates 39146 | lr 0.000319659 | gnorm 0.234 | clip 0 | loss_scale 4 | train_wall 2453 | gb_free 60.9 | wall 0 Start iterating over samples epoch 024: 54 / 1707 loss=4.04, nll_loss=2.403, ppl=5.29, wps=296660, ups=0.7, wpb=424490, bsz=16001.8, num_updates=39200, lr=0.000319438, gnorm=0.226, clip=0, loss_scale=4, train_wall=142, gb_free=58.8, wall=0 epoch 024: 54 / 1707 loss=4.04, nll_loss=2.403, ppl=5.29, wps=296660, ups=0.7, wpb=424490, bsz=16001.8, num_updates=39200, lr=0.000319438, gnorm=0.226, clip=0, loss_scale=4, train_wall=142, gb_free=58.8, wall=0 epoch 024: 155 / 1707 loss=4.037, nll_loss=2.4, ppl=5.28, wps=295504, ups=0.69, wpb=430294, bsz=16565.9, num_updates=39300, lr=0.000319032, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=0 epoch 024: 155 / 1707 loss=4.037, nll_loss=2.4, ppl=5.28, wps=295504, ups=0.69, wpb=430294, bsz=16565.9, num_updates=39300, lr=0.000319032, gnorm=0.234, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=0 epoch 024: 255 / 1707 loss=4.042, nll_loss=2.405, ppl=5.3, wps=296940, ups=0.69, wpb=428488, bsz=16531.1, num_updates=39400, lr=0.000318626, gnorm=0.234, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 024: 255 / 1707 loss=4.042, nll_loss=2.405, ppl=5.3, wps=296940, ups=0.69, wpb=428488, bsz=16531.1, num_updates=39400, lr=0.000318626, gnorm=0.234, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 024: 355 / 1707 loss=4.044, nll_loss=2.408, ppl=5.31, wps=296796, ups=0.69, wpb=427820, bsz=16255.2, num_updates=39500, lr=0.000318223, gnorm=0.223, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=0 epoch 024: 355 / 1707 loss=4.044, nll_loss=2.408, ppl=5.31, wps=296796, ups=0.69, wpb=427820, bsz=16255.2, num_updates=39500, lr=0.000318223, gnorm=0.223, clip=0, loss_scale=4, train_wall=143, gb_free=59.3, wall=0 epoch 024: 456 / 1707 loss=4.038, nll_loss=2.401, ppl=5.28, wps=292937, ups=0.68, wpb=427890, bsz=16382.3, num_updates=39600, lr=0.000317821, gnorm=0.221, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=0 epoch 024: 456 / 1707 loss=4.038, nll_loss=2.401, ppl=5.28, wps=292937, ups=0.68, wpb=427890, bsz=16382.3, num_updates=39600, lr=0.000317821, gnorm=0.221, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=0 epoch 024: 556 / 1707 loss=4.051, nll_loss=2.416, ppl=5.34, wps=297651, ups=0.69, wpb=429492, bsz=16177.9, num_updates=39700, lr=0.00031742, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 024: 556 / 1707 loss=4.051, nll_loss=2.416, ppl=5.34, wps=297651, ups=0.69, wpb=429492, bsz=16177.9, num_updates=39700, lr=0.00031742, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 024: 656 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=296509, ups=0.69, wpb=429396, bsz=16390, num_updates=39800, lr=0.000317021, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 024: 656 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=296509, ups=0.69, wpb=429396, bsz=16390, num_updates=39800, lr=0.000317021, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 024: 756 / 1707 loss=4.049, nll_loss=2.414, ppl=5.33, wps=296948, ups=0.69, wpb=428182, bsz=16188.4, num_updates=39900, lr=0.000316624, gnorm=0.224, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 024: 756 / 1707 loss=4.049, nll_loss=2.414, ppl=5.33, wps=296948, ups=0.69, wpb=428182, bsz=16188.4, num_updates=39900, lr=0.000316624, gnorm=0.224, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 024: 856 / 1707 loss=4.061, nll_loss=2.428, ppl=5.38, wps=296840, ups=0.69, wpb=429165, bsz=16281.4, num_updates=40000, lr=0.000316228, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 024: 856 / 1707 loss=4.061, nll_loss=2.428, ppl=5.38, wps=296840, ups=0.69, wpb=429165, bsz=16281.4, num_updates=40000, lr=0.000316228, gnorm=0.239, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 4.28 | nll_loss 2.658 | ppl 6.31 | wps 76817.2 | wpb 21331 | bsz 1016 | num_updates 40000 | best_loss 4.28 epoch 024 | valid on 'valid' subset | loss 4.28 | nll_loss 2.658 | ppl 6.31 | wps 76817.2 | wpb 21331 | bsz 1016 | num_updates 40000 | best_loss 4.28 epoch 024: 956 / 1707 loss=4.058, nll_loss=2.424, ppl=5.37, wps=255316, ups=0.59, wpb=431049, bsz=16531.9, num_updates=40100, lr=0.000315833, gnorm=0.227, clip=0, loss_scale=8, train_wall=144, gb_free=58.8, wall=0 epoch 024: 956 / 1707 loss=4.058, nll_loss=2.424, ppl=5.37, wps=255316, ups=0.59, wpb=431049, bsz=16531.9, num_updates=40100, lr=0.000315833, gnorm=0.227, clip=0, loss_scale=8, train_wall=144, gb_free=58.8, wall=0 epoch 024: 1057 / 1707 loss=4.051, nll_loss=2.417, ppl=5.34, wps=294180, ups=0.68, wpb=429528, bsz=16323.6, num_updates=40200, lr=0.00031544, gnorm=0.233, clip=0, loss_scale=4, train_wall=145, gb_free=59.6, wall=0 epoch 024: 1057 / 1707 loss=4.051, nll_loss=2.417, ppl=5.34, wps=294180, ups=0.68, wpb=429528, bsz=16323.6, num_updates=40200, lr=0.00031544, gnorm=0.233, clip=0, loss_scale=4, train_wall=145, gb_free=59.6, wall=0 epoch 024: 1157 / 1707 loss=4.049, nll_loss=2.415, ppl=5.33, wps=296162, ups=0.69, wpb=428264, bsz=16336.2, num_updates=40300, lr=0.000315049, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=0 epoch 024: 1157 / 1707 loss=4.049, nll_loss=2.415, ppl=5.33, wps=296162, ups=0.69, wpb=428264, bsz=16336.2, num_updates=40300, lr=0.000315049, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=59.1, wall=0 epoch 024: 1258 / 1707 loss=4.048, nll_loss=2.413, ppl=5.33, wps=292742, ups=0.68, wpb=427455, bsz=16067.2, num_updates=40400, lr=0.000314658, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=0 epoch 024: 1258 / 1707 loss=4.048, nll_loss=2.413, ppl=5.33, wps=292742, ups=0.68, wpb=427455, bsz=16067.2, num_updates=40400, lr=0.000314658, gnorm=0.232, clip=0, loss_scale=2, train_wall=145, gb_free=59.3, wall=0 epoch 024: 1358 / 1707 loss=4.058, nll_loss=2.424, ppl=5.37, wps=296702, ups=0.69, wpb=429493, bsz=16433, num_updates=40500, lr=0.00031427, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=0 epoch 024: 1358 / 1707 loss=4.058, nll_loss=2.424, ppl=5.37, wps=296702, ups=0.69, wpb=429493, bsz=16433, num_updates=40500, lr=0.00031427, gnorm=0.23, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=0 epoch 024: 1458 / 1707 loss=4.057, nll_loss=2.424, ppl=5.37, wps=297460, ups=0.69, wpb=431031, bsz=16693.9, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 024: 1458 / 1707 loss=4.057, nll_loss=2.424, ppl=5.37, wps=297460, ups=0.69, wpb=431031, bsz=16693.9, num_updates=40600, lr=0.000313882, gnorm=0.234, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 024: 1559 / 1707 loss=4.057, nll_loss=2.424, ppl=5.37, wps=294878, ups=0.69, wpb=430459, bsz=16368.1, num_updates=40700, lr=0.000313497, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=0 epoch 024: 1559 / 1707 loss=4.057, nll_loss=2.424, ppl=5.37, wps=294878, ups=0.69, wpb=430459, bsz=16368.1, num_updates=40700, lr=0.000313497, gnorm=0.231, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=0 epoch 024: 1659 / 1707 loss=4.055, nll_loss=2.422, ppl=5.36, wps=297544, ups=0.69, wpb=429522, bsz=16133.6, num_updates=40800, lr=0.000313112, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 epoch 024: 1659 / 1707 loss=4.055, nll_loss=2.422, ppl=5.36, wps=297544, ups=0.69, wpb=429522, bsz=16133.6, num_updates=40800, lr=0.000313112, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 end of epoch 24 (average epoch stats below) epoch 024 | loss 4.05 | nll_loss 2.415 | ppl 5.33 | wps 293228 | ups 0.68 | wpb 428947 | bsz 16332.5 | num_updates 40848 | lr 0.000312928 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2451 | gb_free 60 | wall 0 epoch 024 | loss 4.05 | nll_loss 2.415 | ppl 5.33 | wps 293228 | ups 0.68 | wpb 428947 | bsz 16332.5 | num_updates 40848 | lr 0.000312928 | gnorm 0.231 | clip 0 | loss_scale 2 | train_wall 2451 | gb_free 60 | wall 0 Start iterating over samples epoch 025: 53 / 1707 loss=4.038, nll_loss=2.402, ppl=5.28, wps=292730, ups=0.69, wpb=425223, bsz=16233, num_updates=40900, lr=0.000312729, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 025: 53 / 1707 loss=4.038, nll_loss=2.402, ppl=5.28, wps=292730, ups=0.69, wpb=425223, bsz=16233, num_updates=40900, lr=0.000312729, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 025: 53 / 1707 loss=4.038, nll_loss=2.402, ppl=5.28, wps=292730, ups=0.69, wpb=425223, bsz=16233, num_updates=40900, lr=0.000312729, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 025: 153 / 1707 loss=4.025, nll_loss=2.385, ppl=5.23, wps=296517, ups=0.69, wpb=428488, bsz=16150.3, num_updates=41000, lr=0.000312348, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=0 epoch 025: 153 / 1707 loss=4.025, nll_loss=2.385, ppl=5.23, wps=296517, ups=0.69, wpb=428488, bsz=16150.3, num_updates=41000, lr=0.000312348, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=0 epoch 025: 153 / 1707 loss=4.025, nll_loss=2.385, ppl=5.23, wps=296517, ups=0.69, wpb=428488, bsz=16150.3, num_updates=41000, lr=0.000312348, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=0 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 4.295 | nll_loss 2.673 | ppl 6.38 | wps 76205.3 | wpb 21331 | bsz 1016 | num_updates 41000 | best_loss 4.28 epoch 025 | valid on 'valid' subset | loss 4.295 | nll_loss 2.673 | ppl 6.38 | wps 76205.3 | wpb 21331 | bsz 1016 | num_updates 41000 | best_loss 4.28 epoch 025 | valid on 'valid' subset | loss 4.295 | nll_loss 2.673 | ppl 6.38 | wps 76205.3 | wpb 21331 | bsz 1016 | num_updates 41000 | best_loss 4.28 epoch 025: 253 / 1707 loss=4.028, nll_loss=2.39, ppl=5.24, wps=269926, ups=0.63, wpb=429090, bsz=16244.2, num_updates=41100, lr=0.000311967, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 025: 253 / 1707 loss=4.028, nll_loss=2.39, ppl=5.24, wps=269926, ups=0.63, wpb=429090, bsz=16244.2, num_updates=41100, lr=0.000311967, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 025: 253 / 1707 loss=4.028, nll_loss=2.39, ppl=5.24, wps=269926, ups=0.63, wpb=429090, bsz=16244.2, num_updates=41100, lr=0.000311967, gnorm=0.232, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 025: 353 / 1707 loss=4.025, nll_loss=2.387, ppl=5.23, wps=297002, ups=0.69, wpb=428431, bsz=16427.7, num_updates=41200, lr=0.000311588, gnorm=0.231, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 025: 353 / 1707 loss=4.025, nll_loss=2.387, ppl=5.23, wps=297002, ups=0.69, wpb=428431, bsz=16427.7, num_updates=41200, lr=0.000311588, gnorm=0.231, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 025: 353 / 1707 loss=4.025, nll_loss=2.387, ppl=5.23, wps=297002, ups=0.69, wpb=428431, bsz=16427.7, num_updates=41200, lr=0.000311588, gnorm=0.231, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 025: 453 / 1707 loss=4.032, nll_loss=2.395, ppl=5.26, wps=295466, ups=0.69, wpb=427263, bsz=16386.8, num_updates=41300, lr=0.000311211, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 025: 453 / 1707 loss=4.032, nll_loss=2.395, ppl=5.26, wps=295466, ups=0.69, wpb=427263, bsz=16386.8, num_updates=41300, lr=0.000311211, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 025: 453 / 1707 loss=4.032, nll_loss=2.395, ppl=5.26, wps=295466, ups=0.69, wpb=427263, bsz=16386.8, num_updates=41300, lr=0.000311211, gnorm=0.236, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 025: 554 / 1707 loss=4.033, nll_loss=2.395, ppl=5.26, wps=295036, ups=0.69, wpb=430219, bsz=16104.7, num_updates=41400, lr=0.000310835, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=0 epoch 025: 554 / 1707 loss=4.033, nll_loss=2.395, ppl=5.26, wps=295036, ups=0.69, wpb=430219, bsz=16104.7, num_updates=41400, lr=0.000310835, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=0 epoch 025: 554 / 1707 loss=4.033, nll_loss=2.395, ppl=5.26, wps=295036, ups=0.69, wpb=430219, bsz=16104.7, num_updates=41400, lr=0.000310835, gnorm=0.229, clip=0, loss_scale=2, train_wall=145, gb_free=59.5, wall=0 epoch 025: 654 / 1707 loss=4.04, nll_loss=2.404, ppl=5.29, wps=296946, ups=0.69, wpb=428999, bsz=16227.9, num_updates=41500, lr=0.00031046, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 025: 654 / 1707 loss=4.04, nll_loss=2.404, ppl=5.29, wps=296946, ups=0.69, wpb=428999, bsz=16227.9, num_updates=41500, lr=0.00031046, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 025: 654 / 1707 loss=4.04, nll_loss=2.404, ppl=5.29, wps=296946, ups=0.69, wpb=428999, bsz=16227.9, num_updates=41500, lr=0.00031046, gnorm=0.229, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 025: 754 / 1707 loss=4.048, nll_loss=2.413, ppl=5.33, wps=297905, ups=0.69, wpb=430263, bsz=16326.7, num_updates=41600, lr=0.000310087, gnorm=0.233, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 025: 754 / 1707 loss=4.048, nll_loss=2.413, ppl=5.33, wps=297905, ups=0.69, wpb=430263, bsz=16326.7, num_updates=41600, lr=0.000310087, gnorm=0.233, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 025: 754 / 1707 loss=4.048, nll_loss=2.413, ppl=5.33, wps=297905, ups=0.69, wpb=430263, bsz=16326.7, num_updates=41600, lr=0.000310087, gnorm=0.233, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 025: 855 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=293216, ups=0.69, wpb=427735, bsz=16331, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=0 epoch 025: 855 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=293216, ups=0.69, wpb=427735, bsz=16331, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=0 epoch 025: 855 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=293216, ups=0.69, wpb=427735, bsz=16331, num_updates=41700, lr=0.000309715, gnorm=0.235, clip=0, loss_scale=2, train_wall=145, gb_free=59.4, wall=0 epoch 025: 955 / 1707 loss=4.047, nll_loss=2.412, ppl=5.32, wps=296795, ups=0.69, wpb=430302, bsz=16346.7, num_updates=41800, lr=0.000309344, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 025: 955 / 1707 loss=4.047, nll_loss=2.412, ppl=5.32, wps=296795, ups=0.69, wpb=430302, bsz=16346.7, num_updates=41800, lr=0.000309344, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 025: 955 / 1707 loss=4.047, nll_loss=2.412, ppl=5.32, wps=296795, ups=0.69, wpb=430302, bsz=16346.7, num_updates=41800, lr=0.000309344, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 025: 1055 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=296495, ups=0.69, wpb=428742, bsz=16562.6, num_updates=41900, lr=0.000308975, gnorm=0.227, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 025: 1055 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=296495, ups=0.69, wpb=428742, bsz=16562.6, num_updates=41900, lr=0.000308975, gnorm=0.227, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 025: 1055 / 1707 loss=4.05, nll_loss=2.415, ppl=5.33, wps=296495, ups=0.69, wpb=428742, bsz=16562.6, num_updates=41900, lr=0.000308975, gnorm=0.227, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 025: 1155 / 1707 loss=4.054, nll_loss=2.42, ppl=5.35, wps=298643, ups=0.69, wpb=430297, bsz=16303.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 025: 1155 / 1707 loss=4.054, nll_loss=2.42, ppl=5.35, wps=298643, ups=0.69, wpb=430297, bsz=16303.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 025: 1155 / 1707 loss=4.054, nll_loss=2.42, ppl=5.35, wps=298643, ups=0.69, wpb=430297, bsz=16303.2, num_updates=42000, lr=0.000308607, gnorm=0.236, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 4.298 | nll_loss 2.677 | ppl 6.4 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 42000 | best_loss 4.28 epoch 025 | valid on 'valid' subset | loss 4.298 | nll_loss 2.677 | ppl 6.4 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 42000 | best_loss 4.28 epoch 025 | valid on 'valid' subset | loss 4.298 | nll_loss 2.677 | ppl 6.4 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 42000 | best_loss 4.28 epoch 025: 1257 / 1707 loss=4.054, nll_loss=2.42, ppl=5.35, wps=270623, ups=0.63, wpb=428096, bsz=16193.4, num_updates=42100, lr=0.00030824, gnorm=0.239, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=0 epoch 025: 1257 / 1707 loss=4.054, nll_loss=2.42, ppl=5.35, wps=270623, ups=0.63, wpb=428096, bsz=16193.4, num_updates=42100, lr=0.00030824, gnorm=0.239, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=0 epoch 025: 1257 / 1707 loss=4.054, nll_loss=2.42, ppl=5.35, wps=270623, ups=0.63, wpb=428096, bsz=16193.4, num_updates=42100, lr=0.00030824, gnorm=0.239, clip=0, loss_scale=1, train_wall=146, gb_free=59.1, wall=0 epoch 025: 1357 / 1707 loss=4.051, nll_loss=2.417, ppl=5.34, wps=299528, ups=0.7, wpb=430539, bsz=16241.1, num_updates=42200, lr=0.000307875, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 025: 1357 / 1707 loss=4.051, nll_loss=2.417, ppl=5.34, wps=299528, ups=0.7, wpb=430539, bsz=16241.1, num_updates=42200, lr=0.000307875, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 025: 1357 / 1707 loss=4.051, nll_loss=2.417, ppl=5.34, wps=299528, ups=0.7, wpb=430539, bsz=16241.1, num_updates=42200, lr=0.000307875, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 025: 1457 / 1707 loss=4.052, nll_loss=2.418, ppl=5.34, wps=295859, ups=0.69, wpb=427873, bsz=16501, num_updates=42300, lr=0.00030751, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 025: 1457 / 1707 loss=4.052, nll_loss=2.418, ppl=5.34, wps=295859, ups=0.69, wpb=427873, bsz=16501, num_updates=42300, lr=0.00030751, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 025: 1457 / 1707 loss=4.052, nll_loss=2.418, ppl=5.34, wps=295859, ups=0.69, wpb=427873, bsz=16501, num_updates=42300, lr=0.00030751, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 025: 1557 / 1707 loss=4.049, nll_loss=2.414, ppl=5.33, wps=298161, ups=0.69, wpb=429670, bsz=16285.8, num_updates=42400, lr=0.000307148, gnorm=0.225, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 025: 1557 / 1707 loss=4.049, nll_loss=2.414, ppl=5.33, wps=298161, ups=0.69, wpb=429670, bsz=16285.8, num_updates=42400, lr=0.000307148, gnorm=0.225, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 025: 1557 / 1707 loss=4.049, nll_loss=2.414, ppl=5.33, wps=298161, ups=0.69, wpb=429670, bsz=16285.8, num_updates=42400, lr=0.000307148, gnorm=0.225, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 025: 1657 / 1707 loss=4.052, nll_loss=2.419, ppl=5.35, wps=297439, ups=0.69, wpb=429888, bsz=16690.5, num_updates=42500, lr=0.000306786, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=0 epoch 025: 1657 / 1707 loss=4.052, nll_loss=2.419, ppl=5.35, wps=297439, ups=0.69, wpb=429888, bsz=16690.5, num_updates=42500, lr=0.000306786, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=0 epoch 025: 1657 / 1707 loss=4.052, nll_loss=2.419, ppl=5.35, wps=297439, ups=0.69, wpb=429888, bsz=16690.5, num_updates=42500, lr=0.000306786, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.3, wall=0 end of epoch 25 (average epoch stats below) epoch 025 | loss 4.042 | nll_loss 2.407 | ppl 5.3 | wps 293170 | ups 0.68 | wpb 428962 | bsz 16331.5 | num_updates 42550 | lr 0.000306606 | gnorm 0.233 | clip 0.1 | loss_scale 2 | train_wall 2450 | gb_free 59.1 | wall 0 epoch 025 | loss 4.042 | nll_loss 2.407 | ppl 5.3 | wps 293170 | ups 0.68 | wpb 428962 | bsz 16331.5 | num_updates 42550 | lr 0.000306606 | gnorm 0.233 | clip 0.1 | loss_scale 2 | train_wall 2450 | gb_free 59.1 | wall 0 epoch 025 | loss 4.042 | nll_loss 2.407 | ppl 5.3 | wps 293170 | ups 0.68 | wpb 428962 | bsz 16331.5 | num_updates 42550 | lr 0.000306606 | gnorm 0.233 | clip 0.1 | loss_scale 2 | train_wall 2450 | gb_free 59.1 | wall 0 Start iterating over samples epoch 026: 50 / 1707 loss=4.038, nll_loss=2.402, ppl=5.29, wps=295673, ups=0.69, wpb=426936, bsz=16281.7, num_updates=42600, lr=0.000306426, gnorm=0.231, clip=1, loss_scale=4, train_wall=143, gb_free=59.1, wall=0 epoch 026: 50 / 1707 loss=4.038, nll_loss=2.402, ppl=5.29, wps=295673, ups=0.69, wpb=426936, bsz=16281.7, num_updates=42600, lr=0.000306426, gnorm=0.231, clip=1, loss_scale=4, train_wall=143, gb_free=59.1, wall=0 epoch 026: 50 / 1707 loss=4.038, nll_loss=2.402, ppl=5.29, wps=295673, ups=0.69, wpb=426936, bsz=16281.7, num_updates=42600, lr=0.000306426, gnorm=0.231, clip=1, loss_scale=4, train_wall=143, gb_free=59.1, wall=0 epoch 026: 50 / 1707 loss=4.038, nll_loss=2.402, ppl=5.29, wps=295673, ups=0.69, wpb=426936, bsz=16281.7, num_updates=42600, lr=0.000306426, gnorm=0.231, clip=1, loss_scale=4, train_wall=143, gb_free=59.1, wall=0 epoch 026: 150 / 1707 loss=4.012, nll_loss=2.372, ppl=5.18, wps=296320, ups=0.69, wpb=428741, bsz=16322.6, num_updates=42700, lr=0.000306067, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 026: 150 / 1707 loss=4.012, nll_loss=2.372, ppl=5.18, wps=296320, ups=0.69, wpb=428741, bsz=16322.6, num_updates=42700, lr=0.000306067, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 026: 150 / 1707 loss=4.012, nll_loss=2.372, ppl=5.18, wps=296320, ups=0.69, wpb=428741, bsz=16322.6, num_updates=42700, lr=0.000306067, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 026: 150 / 1707 loss=4.012, nll_loss=2.372, ppl=5.18, wps=296320, ups=0.69, wpb=428741, bsz=16322.6, num_updates=42700, lr=0.000306067, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 026: 252 / 1707 loss=4.026, nll_loss=2.387, ppl=5.23, wps=293134, ups=0.68, wpb=429540, bsz=16546.8, num_updates=42800, lr=0.000305709, gnorm=0.222, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=0 epoch 026: 252 / 1707 loss=4.026, nll_loss=2.387, ppl=5.23, wps=293134, ups=0.68, wpb=429540, bsz=16546.8, num_updates=42800, lr=0.000305709, gnorm=0.222, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=0 epoch 026: 252 / 1707 loss=4.026, nll_loss=2.387, ppl=5.23, wps=293134, ups=0.68, wpb=429540, bsz=16546.8, num_updates=42800, lr=0.000305709, gnorm=0.222, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=0 epoch 026: 252 / 1707 loss=4.026, nll_loss=2.387, ppl=5.23, wps=293134, ups=0.68, wpb=429540, bsz=16546.8, num_updates=42800, lr=0.000305709, gnorm=0.222, clip=0, loss_scale=1, train_wall=146, gb_free=59.2, wall=0 epoch 026: 352 / 1707 loss=4.023, nll_loss=2.385, ppl=5.22, wps=297250, ups=0.69, wpb=429827, bsz=16386.4, num_updates=42900, lr=0.000305352, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 026: 352 / 1707 loss=4.023, nll_loss=2.385, ppl=5.22, wps=297250, ups=0.69, wpb=429827, bsz=16386.4, num_updates=42900, lr=0.000305352, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 026: 352 / 1707 loss=4.023, nll_loss=2.385, ppl=5.22, wps=297250, ups=0.69, wpb=429827, bsz=16386.4, num_updates=42900, lr=0.000305352, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 026: 352 / 1707 loss=4.023, nll_loss=2.385, ppl=5.22, wps=297250, ups=0.69, wpb=429827, bsz=16386.4, num_updates=42900, lr=0.000305352, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 026: 452 / 1707 loss=4.023, nll_loss=2.384, ppl=5.22, wps=297172, ups=0.69, wpb=427994, bsz=16040.4, num_updates=43000, lr=0.000304997, gnorm=0.241, clip=1, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 026: 452 / 1707 loss=4.023, nll_loss=2.384, ppl=5.22, wps=297172, ups=0.69, wpb=427994, bsz=16040.4, num_updates=43000, lr=0.000304997, gnorm=0.241, clip=1, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 026: 452 / 1707 loss=4.023, nll_loss=2.384, ppl=5.22, wps=297172, ups=0.69, wpb=427994, bsz=16040.4, num_updates=43000, lr=0.000304997, gnorm=0.241, clip=1, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 026: 452 / 1707 loss=4.023, nll_loss=2.384, ppl=5.22, wps=297172, ups=0.69, wpb=427994, bsz=16040.4, num_updates=43000, lr=0.000304997, gnorm=0.241, clip=1, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 4.295 | nll_loss 2.673 | ppl 6.38 | wps 76810.1 | wpb 21331 | bsz 1016 | num_updates 43000 | best_loss 4.28 epoch 026 | valid on 'valid' subset | loss 4.295 | nll_loss 2.673 | ppl 6.38 | wps 76810.1 | wpb 21331 | bsz 1016 | num_updates 43000 | best_loss 4.28 epoch 026 | valid on 'valid' subset | loss 4.295 | nll_loss 2.673 | ppl 6.38 | wps 76810.1 | wpb 21331 | bsz 1016 | num_updates 43000 | best_loss 4.28 epoch 026 | valid on 'valid' subset | loss 4.295 | nll_loss 2.673 | ppl 6.38 | wps 76810.1 | wpb 21331 | bsz 1016 | num_updates 43000 | best_loss 4.28 epoch 026: 553 / 1707 loss=4.029, nll_loss=2.391, ppl=5.25, wps=272228, ups=0.64, wpb=427978, bsz=16298.1, num_updates=43100, lr=0.000304643, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=57.6, wall=0 epoch 026: 553 / 1707 loss=4.029, nll_loss=2.391, ppl=5.25, wps=272228, ups=0.64, wpb=427978, bsz=16298.1, num_updates=43100, lr=0.000304643, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=57.6, wall=0 epoch 026: 553 / 1707 loss=4.029, nll_loss=2.391, ppl=5.25, wps=272228, ups=0.64, wpb=427978, bsz=16298.1, num_updates=43100, lr=0.000304643, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=57.6, wall=0 epoch 026: 553 / 1707 loss=4.029, nll_loss=2.391, ppl=5.25, wps=272228, ups=0.64, wpb=427978, bsz=16298.1, num_updates=43100, lr=0.000304643, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=57.6, wall=0 epoch 026: 653 / 1707 loss=4.037, nll_loss=2.4, ppl=5.28, wps=297539, ups=0.69, wpb=429599, bsz=16585.6, num_updates=43200, lr=0.00030429, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 026: 653 / 1707 loss=4.037, nll_loss=2.4, ppl=5.28, wps=297539, ups=0.69, wpb=429599, bsz=16585.6, num_updates=43200, lr=0.00030429, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 026: 653 / 1707 loss=4.037, nll_loss=2.4, ppl=5.28, wps=297539, ups=0.69, wpb=429599, bsz=16585.6, num_updates=43200, lr=0.00030429, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 026: 653 / 1707 loss=4.037, nll_loss=2.4, ppl=5.28, wps=297539, ups=0.69, wpb=429599, bsz=16585.6, num_updates=43200, lr=0.00030429, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 026: 753 / 1707 loss=4.033, nll_loss=2.396, ppl=5.26, wps=297376, ups=0.69, wpb=428968, bsz=16327.4, num_updates=43300, lr=0.000303939, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 026: 753 / 1707 loss=4.033, nll_loss=2.396, ppl=5.26, wps=297376, ups=0.69, wpb=428968, bsz=16327.4, num_updates=43300, lr=0.000303939, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 026: 753 / 1707 loss=4.033, nll_loss=2.396, ppl=5.26, wps=297376, ups=0.69, wpb=428968, bsz=16327.4, num_updates=43300, lr=0.000303939, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 026: 753 / 1707 loss=4.033, nll_loss=2.396, ppl=5.26, wps=297376, ups=0.69, wpb=428968, bsz=16327.4, num_updates=43300, lr=0.000303939, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 026: 854 / 1707 loss=4.032, nll_loss=2.394, ppl=5.26, wps=294971, ups=0.69, wpb=430062, bsz=16225.7, num_updates=43400, lr=0.000303588, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 026: 854 / 1707 loss=4.032, nll_loss=2.394, ppl=5.26, wps=294971, ups=0.69, wpb=430062, bsz=16225.7, num_updates=43400, lr=0.000303588, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 026: 854 / 1707 loss=4.032, nll_loss=2.394, ppl=5.26, wps=294971, ups=0.69, wpb=430062, bsz=16225.7, num_updates=43400, lr=0.000303588, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 026: 854 / 1707 loss=4.032, nll_loss=2.394, ppl=5.26, wps=294971, ups=0.69, wpb=430062, bsz=16225.7, num_updates=43400, lr=0.000303588, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 026: 954 / 1707 loss=4.041, nll_loss=2.405, ppl=5.3, wps=296917, ups=0.69, wpb=429199, bsz=16506.5, num_updates=43500, lr=0.000303239, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 026: 954 / 1707 loss=4.041, nll_loss=2.405, ppl=5.3, wps=296917, ups=0.69, wpb=429199, bsz=16506.5, num_updates=43500, lr=0.000303239, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 026: 954 / 1707 loss=4.041, nll_loss=2.405, ppl=5.3, wps=296917, ups=0.69, wpb=429199, bsz=16506.5, num_updates=43500, lr=0.000303239, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 026: 954 / 1707 loss=4.041, nll_loss=2.405, ppl=5.3, wps=296917, ups=0.69, wpb=429199, bsz=16506.5, num_updates=43500, lr=0.000303239, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 026: 1055 / 1707 loss=4.046, nll_loss=2.411, ppl=5.32, wps=295409, ups=0.69, wpb=429881, bsz=16352.5, num_updates=43600, lr=0.000302891, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=145, gb_free=59, wall=0 epoch 026: 1055 / 1707 loss=4.046, nll_loss=2.411, ppl=5.32, wps=295409, ups=0.69, wpb=429881, bsz=16352.5, num_updates=43600, lr=0.000302891, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=145, gb_free=59, wall=0 epoch 026: 1055 / 1707 loss=4.046, nll_loss=2.411, ppl=5.32, wps=295409, ups=0.69, wpb=429881, bsz=16352.5, num_updates=43600, lr=0.000302891, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=145, gb_free=59, wall=0 epoch 026: 1055 / 1707 loss=4.046, nll_loss=2.411, ppl=5.32, wps=295409, ups=0.69, wpb=429881, bsz=16352.5, num_updates=43600, lr=0.000302891, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=145, gb_free=59, wall=0 epoch 026: 1155 / 1707 loss=4.039, nll_loss=2.403, ppl=5.29, wps=296723, ups=0.69, wpb=428866, bsz=16285.5, num_updates=43700, lr=0.000302545, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 026: 1155 / 1707 loss=4.039, nll_loss=2.403, ppl=5.29, wps=296723, ups=0.69, wpb=428866, bsz=16285.5, num_updates=43700, lr=0.000302545, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 026: 1155 / 1707 loss=4.039, nll_loss=2.403, ppl=5.29, wps=296723, ups=0.69, wpb=428866, bsz=16285.5, num_updates=43700, lr=0.000302545, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 026: 1155 / 1707 loss=4.039, nll_loss=2.403, ppl=5.29, wps=296723, ups=0.69, wpb=428866, bsz=16285.5, num_updates=43700, lr=0.000302545, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 026: 1255 / 1707 loss=4.046, nll_loss=2.411, ppl=5.32, wps=297582, ups=0.69, wpb=428692, bsz=16145, num_updates=43800, lr=0.000302199, gnorm=0.23, clip=0, loss_scale=1, train_wall=143, gb_free=59, wall=0 epoch 026: 1255 / 1707 loss=4.046, nll_loss=2.411, ppl=5.32, wps=297582, ups=0.69, wpb=428692, bsz=16145, num_updates=43800, lr=0.000302199, gnorm=0.23, clip=0, loss_scale=1, train_wall=143, gb_free=59, wall=0 epoch 026: 1255 / 1707 loss=4.046, nll_loss=2.411, ppl=5.32, wps=297582, ups=0.69, wpb=428692, bsz=16145, num_updates=43800, lr=0.000302199, gnorm=0.23, clip=0, loss_scale=1, train_wall=143, gb_free=59, wall=0 epoch 026: 1255 / 1707 loss=4.046, nll_loss=2.411, ppl=5.32, wps=297582, ups=0.69, wpb=428692, bsz=16145, num_updates=43800, lr=0.000302199, gnorm=0.23, clip=0, loss_scale=1, train_wall=143, gb_free=59, wall=0 epoch 026: 1355 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=296944, ups=0.69, wpb=428498, bsz=16399.3, num_updates=43900, lr=0.000301855, gnorm=0.237, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 026: 1355 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=296944, ups=0.69, wpb=428498, bsz=16399.3, num_updates=43900, lr=0.000301855, gnorm=0.237, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 026: 1355 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=296944, ups=0.69, wpb=428498, bsz=16399.3, num_updates=43900, lr=0.000301855, gnorm=0.237, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 026: 1355 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=296944, ups=0.69, wpb=428498, bsz=16399.3, num_updates=43900, lr=0.000301855, gnorm=0.237, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 026: 1455 / 1707 loss=4.045, nll_loss=2.41, ppl=5.32, wps=297362, ups=0.69, wpb=429263, bsz=16507.8, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 026: 1455 / 1707 loss=4.045, nll_loss=2.41, ppl=5.32, wps=297362, ups=0.69, wpb=429263, bsz=16507.8, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 026: 1455 / 1707 loss=4.045, nll_loss=2.41, ppl=5.32, wps=297362, ups=0.69, wpb=429263, bsz=16507.8, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 026: 1455 / 1707 loss=4.045, nll_loss=2.41, ppl=5.32, wps=297362, ups=0.69, wpb=429263, bsz=16507.8, num_updates=44000, lr=0.000301511, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 4.284 | nll_loss 2.664 | ppl 6.34 | wps 76677.6 | wpb 21331 | bsz 1016 | num_updates 44000 | best_loss 4.28 epoch 026 | valid on 'valid' subset | loss 4.284 | nll_loss 2.664 | ppl 6.34 | wps 76677.6 | wpb 21331 | bsz 1016 | num_updates 44000 | best_loss 4.28 epoch 026 | valid on 'valid' subset | loss 4.284 | nll_loss 2.664 | ppl 6.34 | wps 76677.6 | wpb 21331 | bsz 1016 | num_updates 44000 | best_loss 4.28 epoch 026 | valid on 'valid' subset | loss 4.284 | nll_loss 2.664 | ppl 6.34 | wps 76677.6 | wpb 21331 | bsz 1016 | num_updates 44000 | best_loss 4.28 epoch 026: 1556 / 1707 loss=4.046, nll_loss=2.412, ppl=5.32, wps=270378, ups=0.63, wpb=430140, bsz=16229.1, num_updates=44100, lr=0.000301169, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 026: 1556 / 1707 loss=4.046, nll_loss=2.412, ppl=5.32, wps=270378, ups=0.63, wpb=430140, bsz=16229.1, num_updates=44100, lr=0.000301169, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 026: 1556 / 1707 loss=4.046, nll_loss=2.412, ppl=5.32, wps=270378, ups=0.63, wpb=430140, bsz=16229.1, num_updates=44100, lr=0.000301169, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 026: 1556 / 1707 loss=4.046, nll_loss=2.412, ppl=5.32, wps=270378, ups=0.63, wpb=430140, bsz=16229.1, num_updates=44100, lr=0.000301169, gnorm=0.229, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 026: 1656 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=297330, ups=0.69, wpb=428317, bsz=16363.4, num_updates=44200, lr=0.000300828, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 026: 1656 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=297330, ups=0.69, wpb=428317, bsz=16363.4, num_updates=44200, lr=0.000300828, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 026: 1656 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=297330, ups=0.69, wpb=428317, bsz=16363.4, num_updates=44200, lr=0.000300828, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 026: 1656 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=297330, ups=0.69, wpb=428317, bsz=16363.4, num_updates=44200, lr=0.000300828, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 end of epoch 26 (average epoch stats below) epoch 026 | loss 4.035 | nll_loss 2.399 | ppl 5.27 | wps 293321 | ups 0.68 | wpb 428931 | bsz 16336.9 | num_updates 44251 | lr 0.000300655 | gnorm 0.232 | clip 0.1 | loss_scale 1 | train_wall 2448 | gb_free 59.3 | wall 0 epoch 026 | loss 4.035 | nll_loss 2.399 | ppl 5.27 | wps 293321 | ups 0.68 | wpb 428931 | bsz 16336.9 | num_updates 44251 | lr 0.000300655 | gnorm 0.232 | clip 0.1 | loss_scale 1 | train_wall 2448 | gb_free 59.3 | wall 0 epoch 026 | loss 4.035 | nll_loss 2.399 | ppl 5.27 | wps 293321 | ups 0.68 | wpb 428931 | bsz 16336.9 | num_updates 44251 | lr 0.000300655 | gnorm 0.232 | clip 0.1 | loss_scale 1 | train_wall 2448 | gb_free 59.3 | wall 0 epoch 026 | loss 4.035 | nll_loss 2.399 | ppl 5.27 | wps 293321 | ups 0.68 | wpb 428931 | bsz 16336.9 | num_updates 44251 | lr 0.000300655 | gnorm 0.232 | clip 0.1 | loss_scale 1 | train_wall 2448 | gb_free 59.3 | wall 0 Start iterating over samples epoch 027: 49 / 1707 loss=4.028, nll_loss=2.391, ppl=5.24, wps=295979, ups=0.7, wpb=424736, bsz=16125, num_updates=44300, lr=0.000300489, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.3, wall=0 epoch 027: 49 / 1707 loss=4.028, nll_loss=2.391, ppl=5.24, wps=295979, ups=0.7, wpb=424736, bsz=16125, num_updates=44300, lr=0.000300489, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.3, wall=0 epoch 027: 49 / 1707 loss=4.028, nll_loss=2.391, ppl=5.24, wps=295979, ups=0.7, wpb=424736, bsz=16125, num_updates=44300, lr=0.000300489, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.3, wall=0 epoch 027: 49 / 1707 loss=4.028, nll_loss=2.391, ppl=5.24, wps=295979, ups=0.7, wpb=424736, bsz=16125, num_updates=44300, lr=0.000300489, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.3, wall=0 epoch 027: 49 / 1707 loss=4.028, nll_loss=2.391, ppl=5.24, wps=295979, ups=0.7, wpb=424736, bsz=16125, num_updates=44300, lr=0.000300489, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.3, wall=0 epoch 027: 149 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=297806, ups=0.69, wpb=430017, bsz=16121.8, num_updates=44400, lr=0.00030015, gnorm=0.221, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 027: 149 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=297806, ups=0.69, wpb=430017, bsz=16121.8, num_updates=44400, lr=0.00030015, gnorm=0.221, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 027: 149 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=297806, ups=0.69, wpb=430017, bsz=16121.8, num_updates=44400, lr=0.00030015, gnorm=0.221, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 027: 149 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=297806, ups=0.69, wpb=430017, bsz=16121.8, num_updates=44400, lr=0.00030015, gnorm=0.221, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 027: 149 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=297806, ups=0.69, wpb=430017, bsz=16121.8, num_updates=44400, lr=0.00030015, gnorm=0.221, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 027: 250 / 1707 loss=4.016, nll_loss=2.376, ppl=5.19, wps=294048, ups=0.69, wpb=428722, bsz=16137.1, num_updates=44500, lr=0.000299813, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 027: 250 / 1707 loss=4.016, nll_loss=2.376, ppl=5.19, wps=294048, ups=0.69, wpb=428722, bsz=16137.1, num_updates=44500, lr=0.000299813, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 027: 250 / 1707 loss=4.016, nll_loss=2.376, ppl=5.19, wps=294048, ups=0.69, wpb=428722, bsz=16137.1, num_updates=44500, lr=0.000299813, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 027: 250 / 1707 loss=4.016, nll_loss=2.376, ppl=5.19, wps=294048, ups=0.69, wpb=428722, bsz=16137.1, num_updates=44500, lr=0.000299813, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 027: 250 / 1707 loss=4.016, nll_loss=2.376, ppl=5.19, wps=294048, ups=0.69, wpb=428722, bsz=16137.1, num_updates=44500, lr=0.000299813, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 027: 350 / 1707 loss=4.017, nll_loss=2.377, ppl=5.2, wps=297092, ups=0.69, wpb=430211, bsz=16377.4, num_updates=44600, lr=0.000299476, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=58.6, wall=0 epoch 027: 350 / 1707 loss=4.017, nll_loss=2.377, ppl=5.2, wps=297092, ups=0.69, wpb=430211, bsz=16377.4, num_updates=44600, lr=0.000299476, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=58.6, wall=0 epoch 027: 350 / 1707 loss=4.017, nll_loss=2.377, ppl=5.2, wps=297092, ups=0.69, wpb=430211, bsz=16377.4, num_updates=44600, lr=0.000299476, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=58.6, wall=0 epoch 027: 350 / 1707 loss=4.017, nll_loss=2.377, ppl=5.2, wps=297092, ups=0.69, wpb=430211, bsz=16377.4, num_updates=44600, lr=0.000299476, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=58.6, wall=0 epoch 027: 350 / 1707 loss=4.017, nll_loss=2.377, ppl=5.2, wps=297092, ups=0.69, wpb=430211, bsz=16377.4, num_updates=44600, lr=0.000299476, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=58.6, wall=0 epoch 027: 450 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296146, ups=0.69, wpb=427529, bsz=16259, num_updates=44700, lr=0.000299141, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 027: 450 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296146, ups=0.69, wpb=427529, bsz=16259, num_updates=44700, lr=0.000299141, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 027: 450 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296146, ups=0.69, wpb=427529, bsz=16259, num_updates=44700, lr=0.000299141, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 027: 450 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296146, ups=0.69, wpb=427529, bsz=16259, num_updates=44700, lr=0.000299141, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 027: 450 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296146, ups=0.69, wpb=427529, bsz=16259, num_updates=44700, lr=0.000299141, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 027: 551 / 1707 loss=4.024, nll_loss=2.386, ppl=5.23, wps=293836, ups=0.69, wpb=428789, bsz=16311.7, num_updates=44800, lr=0.000298807, gnorm=0.246, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 027: 551 / 1707 loss=4.024, nll_loss=2.386, ppl=5.23, wps=293836, ups=0.69, wpb=428789, bsz=16311.7, num_updates=44800, lr=0.000298807, gnorm=0.246, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 027: 551 / 1707 loss=4.024, nll_loss=2.386, ppl=5.23, wps=293836, ups=0.69, wpb=428789, bsz=16311.7, num_updates=44800, lr=0.000298807, gnorm=0.246, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 027: 551 / 1707 loss=4.024, nll_loss=2.386, ppl=5.23, wps=293836, ups=0.69, wpb=428789, bsz=16311.7, num_updates=44800, lr=0.000298807, gnorm=0.246, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 027: 551 / 1707 loss=4.024, nll_loss=2.386, ppl=5.23, wps=293836, ups=0.69, wpb=428789, bsz=16311.7, num_updates=44800, lr=0.000298807, gnorm=0.246, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 027: 651 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296137, ups=0.69, wpb=428446, bsz=16325, num_updates=44900, lr=0.000298474, gnorm=0.228, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 027: 651 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296137, ups=0.69, wpb=428446, bsz=16325, num_updates=44900, lr=0.000298474, gnorm=0.228, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 027: 651 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296137, ups=0.69, wpb=428446, bsz=16325, num_updates=44900, lr=0.000298474, gnorm=0.228, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 027: 651 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296137, ups=0.69, wpb=428446, bsz=16325, num_updates=44900, lr=0.000298474, gnorm=0.228, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 027: 651 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296137, ups=0.69, wpb=428446, bsz=16325, num_updates=44900, lr=0.000298474, gnorm=0.228, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 027: 751 / 1707 loss=4.034, nll_loss=2.397, ppl=5.27, wps=298259, ups=0.69, wpb=430946, bsz=16561.4, num_updates=45000, lr=0.000298142, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 027: 751 / 1707 loss=4.034, nll_loss=2.397, ppl=5.27, wps=298259, ups=0.69, wpb=430946, bsz=16561.4, num_updates=45000, lr=0.000298142, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 027: 751 / 1707 loss=4.034, nll_loss=2.397, ppl=5.27, wps=298259, ups=0.69, wpb=430946, bsz=16561.4, num_updates=45000, lr=0.000298142, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 027: 751 / 1707 loss=4.034, nll_loss=2.397, ppl=5.27, wps=298259, ups=0.69, wpb=430946, bsz=16561.4, num_updates=45000, lr=0.000298142, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 027: 751 / 1707 loss=4.034, nll_loss=2.397, ppl=5.27, wps=298259, ups=0.69, wpb=430946, bsz=16561.4, num_updates=45000, lr=0.000298142, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 4.292 | nll_loss 2.667 | ppl 6.35 | wps 76937.1 | wpb 21331 | bsz 1016 | num_updates 45000 | best_loss 4.28 epoch 027 | valid on 'valid' subset | loss 4.292 | nll_loss 2.667 | ppl 6.35 | wps 76937.1 | wpb 21331 | bsz 1016 | num_updates 45000 | best_loss 4.28 epoch 027 | valid on 'valid' subset | loss 4.292 | nll_loss 2.667 | ppl 6.35 | wps 76937.1 | wpb 21331 | bsz 1016 | num_updates 45000 | best_loss 4.28 epoch 027 | valid on 'valid' subset | loss 4.292 | nll_loss 2.667 | ppl 6.35 | wps 76937.1 | wpb 21331 | bsz 1016 | num_updates 45000 | best_loss 4.28 epoch 027 | valid on 'valid' subset | loss 4.292 | nll_loss 2.667 | ppl 6.35 | wps 76937.1 | wpb 21331 | bsz 1016 | num_updates 45000 | best_loss 4.28 epoch 027: 851 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=267224, ups=0.62, wpb=428408, bsz=16376.1, num_updates=45100, lr=0.000297812, gnorm=0.231, clip=0, loss_scale=2, train_wall=143, gb_free=58.7, wall=0 epoch 027: 851 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=267224, ups=0.62, wpb=428408, bsz=16376.1, num_updates=45100, lr=0.000297812, gnorm=0.231, clip=0, loss_scale=2, train_wall=143, gb_free=58.7, wall=0 epoch 027: 851 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=267224, ups=0.62, wpb=428408, bsz=16376.1, num_updates=45100, lr=0.000297812, gnorm=0.231, clip=0, loss_scale=2, train_wall=143, gb_free=58.7, wall=0 epoch 027: 851 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=267224, ups=0.62, wpb=428408, bsz=16376.1, num_updates=45100, lr=0.000297812, gnorm=0.231, clip=0, loss_scale=2, train_wall=143, gb_free=58.7, wall=0 epoch 027: 851 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=267224, ups=0.62, wpb=428408, bsz=16376.1, num_updates=45100, lr=0.000297812, gnorm=0.231, clip=0, loss_scale=2, train_wall=143, gb_free=58.7, wall=0 epoch 027: 951 / 1707 loss=4.033, nll_loss=2.397, ppl=5.27, wps=297125, ups=0.69, wpb=428292, bsz=16313.6, num_updates=45200, lr=0.000297482, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.5, wall=0 epoch 027: 951 / 1707 loss=4.033, nll_loss=2.397, ppl=5.27, wps=297125, ups=0.69, wpb=428292, bsz=16313.6, num_updates=45200, lr=0.000297482, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.5, wall=0 epoch 027: 951 / 1707 loss=4.033, nll_loss=2.397, ppl=5.27, wps=297125, ups=0.69, wpb=428292, bsz=16313.6, num_updates=45200, lr=0.000297482, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.5, wall=0 epoch 027: 951 / 1707 loss=4.033, nll_loss=2.397, ppl=5.27, wps=297125, ups=0.69, wpb=428292, bsz=16313.6, num_updates=45200, lr=0.000297482, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.5, wall=0 epoch 027: 951 / 1707 loss=4.033, nll_loss=2.397, ppl=5.27, wps=297125, ups=0.69, wpb=428292, bsz=16313.6, num_updates=45200, lr=0.000297482, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.5, wall=0 epoch 027: 1052 / 1707 loss=4.027, nll_loss=2.389, ppl=5.24, wps=294787, ups=0.69, wpb=429343, bsz=16347.1, num_updates=45300, lr=0.000297154, gnorm=0.237, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=0 epoch 027: 1052 / 1707 loss=4.027, nll_loss=2.389, ppl=5.24, wps=294787, ups=0.69, wpb=429343, bsz=16347.1, num_updates=45300, lr=0.000297154, gnorm=0.237, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=0 epoch 027: 1052 / 1707 loss=4.027, nll_loss=2.389, ppl=5.24, wps=294787, ups=0.69, wpb=429343, bsz=16347.1, num_updates=45300, lr=0.000297154, gnorm=0.237, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=0 epoch 027: 1052 / 1707 loss=4.027, nll_loss=2.389, ppl=5.24, wps=294787, ups=0.69, wpb=429343, bsz=16347.1, num_updates=45300, lr=0.000297154, gnorm=0.237, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=0 epoch 027: 1052 / 1707 loss=4.027, nll_loss=2.389, ppl=5.24, wps=294787, ups=0.69, wpb=429343, bsz=16347.1, num_updates=45300, lr=0.000297154, gnorm=0.237, clip=0, loss_scale=2, train_wall=145, gb_free=58.9, wall=0 epoch 027: 1152 / 1707 loss=4.036, nll_loss=2.401, ppl=5.28, wps=297810, ups=0.69, wpb=429720, bsz=16403.4, num_updates=45400, lr=0.000296826, gnorm=0.233, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 027: 1152 / 1707 loss=4.036, nll_loss=2.401, ppl=5.28, wps=297810, ups=0.69, wpb=429720, bsz=16403.4, num_updates=45400, lr=0.000296826, gnorm=0.233, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 027: 1152 / 1707 loss=4.036, nll_loss=2.401, ppl=5.28, wps=297810, ups=0.69, wpb=429720, bsz=16403.4, num_updates=45400, lr=0.000296826, gnorm=0.233, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 027: 1152 / 1707 loss=4.036, nll_loss=2.401, ppl=5.28, wps=297810, ups=0.69, wpb=429720, bsz=16403.4, num_updates=45400, lr=0.000296826, gnorm=0.233, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 027: 1152 / 1707 loss=4.036, nll_loss=2.401, ppl=5.28, wps=297810, ups=0.69, wpb=429720, bsz=16403.4, num_updates=45400, lr=0.000296826, gnorm=0.233, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 027: 1253 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=295025, ups=0.69, wpb=429999, bsz=16391.8, num_updates=45500, lr=0.0002965, gnorm=0.246, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 027: 1253 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=295025, ups=0.69, wpb=429999, bsz=16391.8, num_updates=45500, lr=0.0002965, gnorm=0.246, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 027: 1253 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=295025, ups=0.69, wpb=429999, bsz=16391.8, num_updates=45500, lr=0.0002965, gnorm=0.246, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 027: 1253 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=295025, ups=0.69, wpb=429999, bsz=16391.8, num_updates=45500, lr=0.0002965, gnorm=0.246, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 027: 1253 / 1707 loss=4.043, nll_loss=2.408, ppl=5.31, wps=295025, ups=0.69, wpb=429999, bsz=16391.8, num_updates=45500, lr=0.0002965, gnorm=0.246, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 027: 1353 / 1707 loss=4.044, nll_loss=2.409, ppl=5.31, wps=296846, ups=0.69, wpb=429160, bsz=16341.9, num_updates=45600, lr=0.000296174, gnorm=0.235, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 027: 1353 / 1707 loss=4.044, nll_loss=2.409, ppl=5.31, wps=296846, ups=0.69, wpb=429160, bsz=16341.9, num_updates=45600, lr=0.000296174, gnorm=0.235, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 027: 1353 / 1707 loss=4.044, nll_loss=2.409, ppl=5.31, wps=296846, ups=0.69, wpb=429160, bsz=16341.9, num_updates=45600, lr=0.000296174, gnorm=0.235, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 027: 1353 / 1707 loss=4.044, nll_loss=2.409, ppl=5.31, wps=296846, ups=0.69, wpb=429160, bsz=16341.9, num_updates=45600, lr=0.000296174, gnorm=0.235, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 027: 1353 / 1707 loss=4.044, nll_loss=2.409, ppl=5.31, wps=296846, ups=0.69, wpb=429160, bsz=16341.9, num_updates=45600, lr=0.000296174, gnorm=0.235, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 027: 1453 / 1707 loss=4.042, nll_loss=2.407, ppl=5.3, wps=296344, ups=0.69, wpb=429330, bsz=16485.6, num_updates=45700, lr=0.00029585, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 027: 1453 / 1707 loss=4.042, nll_loss=2.407, ppl=5.3, wps=296344, ups=0.69, wpb=429330, bsz=16485.6, num_updates=45700, lr=0.00029585, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 027: 1453 / 1707 loss=4.042, nll_loss=2.407, ppl=5.3, wps=296344, ups=0.69, wpb=429330, bsz=16485.6, num_updates=45700, lr=0.00029585, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 027: 1453 / 1707 loss=4.042, nll_loss=2.407, ppl=5.3, wps=296344, ups=0.69, wpb=429330, bsz=16485.6, num_updates=45700, lr=0.00029585, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 027: 1453 / 1707 loss=4.042, nll_loss=2.407, ppl=5.3, wps=296344, ups=0.69, wpb=429330, bsz=16485.6, num_updates=45700, lr=0.00029585, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 027: 1553 / 1707 loss=4.032, nll_loss=2.396, ppl=5.26, wps=297308, ups=0.69, wpb=429223, bsz=16201.4, num_updates=45800, lr=0.000295527, gnorm=0.228, clip=0, loss_scale=2, train_wall=143, gb_free=58.9, wall=0 epoch 027: 1553 / 1707 loss=4.032, nll_loss=2.396, ppl=5.26, wps=297308, ups=0.69, wpb=429223, bsz=16201.4, num_updates=45800, lr=0.000295527, gnorm=0.228, clip=0, loss_scale=2, train_wall=143, gb_free=58.9, wall=0 epoch 027: 1553 / 1707 loss=4.032, nll_loss=2.396, ppl=5.26, wps=297308, ups=0.69, wpb=429223, bsz=16201.4, num_updates=45800, lr=0.000295527, gnorm=0.228, clip=0, loss_scale=2, train_wall=143, gb_free=58.9, wall=0 epoch 027: 1553 / 1707 loss=4.032, nll_loss=2.396, ppl=5.26, wps=297308, ups=0.69, wpb=429223, bsz=16201.4, num_updates=45800, lr=0.000295527, gnorm=0.228, clip=0, loss_scale=2, train_wall=143, gb_free=58.9, wall=0 epoch 027: 1553 / 1707 loss=4.032, nll_loss=2.396, ppl=5.26, wps=297308, ups=0.69, wpb=429223, bsz=16201.4, num_updates=45800, lr=0.000295527, gnorm=0.228, clip=0, loss_scale=2, train_wall=143, gb_free=58.9, wall=0 epoch 027: 1654 / 1707 loss=4.037, nll_loss=2.402, ppl=5.28, wps=294435, ups=0.69, wpb=429244, bsz=16376.6, num_updates=45900, lr=0.000295205, gnorm=0.225, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 027: 1654 / 1707 loss=4.037, nll_loss=2.402, ppl=5.28, wps=294435, ups=0.69, wpb=429244, bsz=16376.6, num_updates=45900, lr=0.000295205, gnorm=0.225, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 027: 1654 / 1707 loss=4.037, nll_loss=2.402, ppl=5.28, wps=294435, ups=0.69, wpb=429244, bsz=16376.6, num_updates=45900, lr=0.000295205, gnorm=0.225, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 027: 1654 / 1707 loss=4.037, nll_loss=2.402, ppl=5.28, wps=294435, ups=0.69, wpb=429244, bsz=16376.6, num_updates=45900, lr=0.000295205, gnorm=0.225, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 027: 1654 / 1707 loss=4.037, nll_loss=2.402, ppl=5.28, wps=294435, ups=0.69, wpb=429244, bsz=16376.6, num_updates=45900, lr=0.000295205, gnorm=0.225, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 end of epoch 27 (average epoch stats below) epoch 027 | loss 4.029 | nll_loss 2.392 | ppl 5.25 | wps 294302 | ups 0.69 | wpb 428914 | bsz 16331.1 | num_updates 45953 | lr 0.000295035 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 2449 | gb_free 60.1 | wall 0 epoch 027 | loss 4.029 | nll_loss 2.392 | ppl 5.25 | wps 294302 | ups 0.69 | wpb 428914 | bsz 16331.1 | num_updates 45953 | lr 0.000295035 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 2449 | gb_free 60.1 | wall 0 epoch 027 | loss 4.029 | nll_loss 2.392 | ppl 5.25 | wps 294302 | ups 0.69 | wpb 428914 | bsz 16331.1 | num_updates 45953 | lr 0.000295035 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 2449 | gb_free 60.1 | wall 0 epoch 027 | loss 4.029 | nll_loss 2.392 | ppl 5.25 | wps 294302 | ups 0.69 | wpb 428914 | bsz 16331.1 | num_updates 45953 | lr 0.000295035 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 2449 | gb_free 60.1 | wall 0 epoch 027 | loss 4.029 | nll_loss 2.392 | ppl 5.25 | wps 294302 | ups 0.69 | wpb 428914 | bsz 16331.1 | num_updates 45953 | lr 0.000295035 | gnorm 0.234 | clip 0 | loss_scale 1 | train_wall 2449 | gb_free 60.1 | wall 0 Start iterating over samples epoch 028: 47 / 1707 loss=4.03, nll_loss=2.393, ppl=5.25, wps=297841, ups=0.7, wpb=427579, bsz=16322.4, num_updates=46000, lr=0.000294884, gnorm=0.23, clip=0, loss_scale=1, train_wall=142, gb_free=58.9, wall=0 epoch 028: 47 / 1707 loss=4.03, nll_loss=2.393, ppl=5.25, wps=297841, ups=0.7, wpb=427579, bsz=16322.4, num_updates=46000, lr=0.000294884, gnorm=0.23, clip=0, loss_scale=1, train_wall=142, gb_free=58.9, wall=0 epoch 028: 47 / 1707 loss=4.03, nll_loss=2.393, ppl=5.25, wps=297841, ups=0.7, wpb=427579, bsz=16322.4, num_updates=46000, lr=0.000294884, gnorm=0.23, clip=0, loss_scale=1, train_wall=142, gb_free=58.9, wall=0 epoch 028: 47 / 1707 loss=4.03, nll_loss=2.393, ppl=5.25, wps=297841, ups=0.7, wpb=427579, bsz=16322.4, num_updates=46000, lr=0.000294884, gnorm=0.23, clip=0, loss_scale=1, train_wall=142, gb_free=58.9, wall=0 epoch 028: 47 / 1707 loss=4.03, nll_loss=2.393, ppl=5.25, wps=297841, ups=0.7, wpb=427579, bsz=16322.4, num_updates=46000, lr=0.000294884, gnorm=0.23, clip=0, loss_scale=1, train_wall=142, gb_free=58.9, wall=0 epoch 028: 47 / 1707 loss=4.03, nll_loss=2.393, ppl=5.25, wps=297841, ups=0.7, wpb=427579, bsz=16322.4, num_updates=46000, lr=0.000294884, gnorm=0.23, clip=0, loss_scale=1, train_wall=142, gb_free=58.9, wall=0 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 4.288 | nll_loss 2.66 | ppl 6.32 | wps 75358.2 | wpb 21331 | bsz 1016 | num_updates 46000 | best_loss 4.28 epoch 028 | valid on 'valid' subset | loss 4.288 | nll_loss 2.66 | ppl 6.32 | wps 75358.2 | wpb 21331 | bsz 1016 | num_updates 46000 | best_loss 4.28 epoch 028 | valid on 'valid' subset | loss 4.288 | nll_loss 2.66 | ppl 6.32 | wps 75358.2 | wpb 21331 | bsz 1016 | num_updates 46000 | best_loss 4.28 epoch 028 | valid on 'valid' subset | loss 4.288 | nll_loss 2.66 | ppl 6.32 | wps 75358.2 | wpb 21331 | bsz 1016 | num_updates 46000 | best_loss 4.28 epoch 028 | valid on 'valid' subset | loss 4.288 | nll_loss 2.66 | ppl 6.32 | wps 75358.2 | wpb 21331 | bsz 1016 | num_updates 46000 | best_loss 4.28 epoch 028 | valid on 'valid' subset | loss 4.288 | nll_loss 2.66 | ppl 6.32 | wps 75358.2 | wpb 21331 | bsz 1016 | num_updates 46000 | best_loss 4.28 epoch 028: 147 / 1707 loss=4.009, nll_loss=2.368, ppl=5.16, wps=273912, ups=0.64, wpb=428802, bsz=16442.2, num_updates=46100, lr=0.000294564, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 028: 147 / 1707 loss=4.009, nll_loss=2.368, ppl=5.16, wps=273912, ups=0.64, wpb=428802, bsz=16442.2, num_updates=46100, lr=0.000294564, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 028: 147 / 1707 loss=4.009, nll_loss=2.368, ppl=5.16, wps=273912, ups=0.64, wpb=428802, bsz=16442.2, num_updates=46100, lr=0.000294564, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 028: 147 / 1707 loss=4.009, nll_loss=2.368, ppl=5.16, wps=273912, ups=0.64, wpb=428802, bsz=16442.2, num_updates=46100, lr=0.000294564, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 028: 147 / 1707 loss=4.009, nll_loss=2.368, ppl=5.16, wps=273912, ups=0.64, wpb=428802, bsz=16442.2, num_updates=46100, lr=0.000294564, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 028: 147 / 1707 loss=4.009, nll_loss=2.368, ppl=5.16, wps=273912, ups=0.64, wpb=428802, bsz=16442.2, num_updates=46100, lr=0.000294564, gnorm=0.232, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 028: 248 / 1707 loss=4.013, nll_loss=2.373, ppl=5.18, wps=295235, ups=0.69, wpb=429828, bsz=16173.8, num_updates=46200, lr=0.000294245, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 028: 248 / 1707 loss=4.013, nll_loss=2.373, ppl=5.18, wps=295235, ups=0.69, wpb=429828, bsz=16173.8, num_updates=46200, lr=0.000294245, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 028: 248 / 1707 loss=4.013, nll_loss=2.373, ppl=5.18, wps=295235, ups=0.69, wpb=429828, bsz=16173.8, num_updates=46200, lr=0.000294245, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 028: 248 / 1707 loss=4.013, nll_loss=2.373, ppl=5.18, wps=295235, ups=0.69, wpb=429828, bsz=16173.8, num_updates=46200, lr=0.000294245, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 028: 248 / 1707 loss=4.013, nll_loss=2.373, ppl=5.18, wps=295235, ups=0.69, wpb=429828, bsz=16173.8, num_updates=46200, lr=0.000294245, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 028: 248 / 1707 loss=4.013, nll_loss=2.373, ppl=5.18, wps=295235, ups=0.69, wpb=429828, bsz=16173.8, num_updates=46200, lr=0.000294245, gnorm=0.232, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 028: 348 / 1707 loss=4.011, nll_loss=2.371, ppl=5.17, wps=297273, ups=0.69, wpb=429621, bsz=16383.7, num_updates=46300, lr=0.000293927, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 028: 348 / 1707 loss=4.011, nll_loss=2.371, ppl=5.17, wps=297273, ups=0.69, wpb=429621, bsz=16383.7, num_updates=46300, lr=0.000293927, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 028: 348 / 1707 loss=4.011, nll_loss=2.371, ppl=5.17, wps=297273, ups=0.69, wpb=429621, bsz=16383.7, num_updates=46300, lr=0.000293927, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 028: 348 / 1707 loss=4.011, nll_loss=2.371, ppl=5.17, wps=297273, ups=0.69, wpb=429621, bsz=16383.7, num_updates=46300, lr=0.000293927, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 028: 348 / 1707 loss=4.011, nll_loss=2.371, ppl=5.17, wps=297273, ups=0.69, wpb=429621, bsz=16383.7, num_updates=46300, lr=0.000293927, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 028: 348 / 1707 loss=4.011, nll_loss=2.371, ppl=5.17, wps=297273, ups=0.69, wpb=429621, bsz=16383.7, num_updates=46300, lr=0.000293927, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 028: 448 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=297567, ups=0.69, wpb=429617, bsz=16321.1, num_updates=46400, lr=0.00029361, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 448 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=297567, ups=0.69, wpb=429617, bsz=16321.1, num_updates=46400, lr=0.00029361, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 448 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=297567, ups=0.69, wpb=429617, bsz=16321.1, num_updates=46400, lr=0.00029361, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 448 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=297567, ups=0.69, wpb=429617, bsz=16321.1, num_updates=46400, lr=0.00029361, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 448 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=297567, ups=0.69, wpb=429617, bsz=16321.1, num_updates=46400, lr=0.00029361, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 448 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=297567, ups=0.69, wpb=429617, bsz=16321.1, num_updates=46400, lr=0.00029361, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 549 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294376, ups=0.69, wpb=429062, bsz=16363.8, num_updates=46500, lr=0.000293294, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 028: 549 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294376, ups=0.69, wpb=429062, bsz=16363.8, num_updates=46500, lr=0.000293294, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 028: 549 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294376, ups=0.69, wpb=429062, bsz=16363.8, num_updates=46500, lr=0.000293294, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 028: 549 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294376, ups=0.69, wpb=429062, bsz=16363.8, num_updates=46500, lr=0.000293294, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 028: 549 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294376, ups=0.69, wpb=429062, bsz=16363.8, num_updates=46500, lr=0.000293294, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 028: 549 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294376, ups=0.69, wpb=429062, bsz=16363.8, num_updates=46500, lr=0.000293294, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 028: 649 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296727, ups=0.69, wpb=428457, bsz=16608.7, num_updates=46600, lr=0.000292979, gnorm=0.23, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 028: 649 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296727, ups=0.69, wpb=428457, bsz=16608.7, num_updates=46600, lr=0.000292979, gnorm=0.23, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 028: 649 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296727, ups=0.69, wpb=428457, bsz=16608.7, num_updates=46600, lr=0.000292979, gnorm=0.23, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 028: 649 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296727, ups=0.69, wpb=428457, bsz=16608.7, num_updates=46600, lr=0.000292979, gnorm=0.23, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 028: 649 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296727, ups=0.69, wpb=428457, bsz=16608.7, num_updates=46600, lr=0.000292979, gnorm=0.23, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 028: 649 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296727, ups=0.69, wpb=428457, bsz=16608.7, num_updates=46600, lr=0.000292979, gnorm=0.23, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 028: 749 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=295910, ups=0.69, wpb=428116, bsz=16518.9, num_updates=46700, lr=0.000292666, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 749 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=295910, ups=0.69, wpb=428116, bsz=16518.9, num_updates=46700, lr=0.000292666, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 749 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=295910, ups=0.69, wpb=428116, bsz=16518.9, num_updates=46700, lr=0.000292666, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 749 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=295910, ups=0.69, wpb=428116, bsz=16518.9, num_updates=46700, lr=0.000292666, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 749 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=295910, ups=0.69, wpb=428116, bsz=16518.9, num_updates=46700, lr=0.000292666, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 749 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=295910, ups=0.69, wpb=428116, bsz=16518.9, num_updates=46700, lr=0.000292666, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 028: 849 / 1707 loss=4.024, nll_loss=2.386, ppl=5.23, wps=294736, ups=0.69, wpb=427460, bsz=16113.4, num_updates=46800, lr=0.000292353, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 849 / 1707 loss=4.024, nll_loss=2.386, ppl=5.23, wps=294736, ups=0.69, wpb=427460, bsz=16113.4, num_updates=46800, lr=0.000292353, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 849 / 1707 loss=4.024, nll_loss=2.386, ppl=5.23, wps=294736, ups=0.69, wpb=427460, bsz=16113.4, num_updates=46800, lr=0.000292353, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 849 / 1707 loss=4.024, nll_loss=2.386, ppl=5.23, wps=294736, ups=0.69, wpb=427460, bsz=16113.4, num_updates=46800, lr=0.000292353, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 849 / 1707 loss=4.024, nll_loss=2.386, ppl=5.23, wps=294736, ups=0.69, wpb=427460, bsz=16113.4, num_updates=46800, lr=0.000292353, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 849 / 1707 loss=4.024, nll_loss=2.386, ppl=5.23, wps=294736, ups=0.69, wpb=427460, bsz=16113.4, num_updates=46800, lr=0.000292353, gnorm=0.242, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 949 / 1707 loss=4.034, nll_loss=2.398, ppl=5.27, wps=297465, ups=0.69, wpb=429733, bsz=16308, num_updates=46900, lr=0.000292041, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 028: 949 / 1707 loss=4.034, nll_loss=2.398, ppl=5.27, wps=297465, ups=0.69, wpb=429733, bsz=16308, num_updates=46900, lr=0.000292041, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 028: 949 / 1707 loss=4.034, nll_loss=2.398, ppl=5.27, wps=297465, ups=0.69, wpb=429733, bsz=16308, num_updates=46900, lr=0.000292041, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 028: 949 / 1707 loss=4.034, nll_loss=2.398, ppl=5.27, wps=297465, ups=0.69, wpb=429733, bsz=16308, num_updates=46900, lr=0.000292041, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 028: 949 / 1707 loss=4.034, nll_loss=2.398, ppl=5.27, wps=297465, ups=0.69, wpb=429733, bsz=16308, num_updates=46900, lr=0.000292041, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 028: 949 / 1707 loss=4.034, nll_loss=2.398, ppl=5.27, wps=297465, ups=0.69, wpb=429733, bsz=16308, num_updates=46900, lr=0.000292041, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 028: 1049 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=295563, ups=0.69, wpb=428222, bsz=16497.1, num_updates=47000, lr=0.00029173, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 028: 1049 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=295563, ups=0.69, wpb=428222, bsz=16497.1, num_updates=47000, lr=0.00029173, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 028: 1049 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=295563, ups=0.69, wpb=428222, bsz=16497.1, num_updates=47000, lr=0.00029173, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 028: 1049 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=295563, ups=0.69, wpb=428222, bsz=16497.1, num_updates=47000, lr=0.00029173, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 028: 1049 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=295563, ups=0.69, wpb=428222, bsz=16497.1, num_updates=47000, lr=0.00029173, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 028: 1049 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=295563, ups=0.69, wpb=428222, bsz=16497.1, num_updates=47000, lr=0.00029173, gnorm=0.227, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76893.2 | wpb 21331 | bsz 1016 | num_updates 47000 | best_loss 4.28 epoch 028 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76893.2 | wpb 21331 | bsz 1016 | num_updates 47000 | best_loss 4.28 epoch 028 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76893.2 | wpb 21331 | bsz 1016 | num_updates 47000 | best_loss 4.28 epoch 028 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76893.2 | wpb 21331 | bsz 1016 | num_updates 47000 | best_loss 4.28 epoch 028 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76893.2 | wpb 21331 | bsz 1016 | num_updates 47000 | best_loss 4.28 epoch 028 | valid on 'valid' subset | loss 4.296 | nll_loss 2.676 | ppl 6.39 | wps 76893.2 | wpb 21331 | bsz 1016 | num_updates 47000 | best_loss 4.28 epoch 028: 1150 / 1707 loss=4.034, nll_loss=2.397, ppl=5.27, wps=273034, ups=0.64, wpb=429782, bsz=16427.4, num_updates=47100, lr=0.00029142, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=0 epoch 028: 1150 / 1707 loss=4.034, nll_loss=2.397, ppl=5.27, wps=273034, ups=0.64, wpb=429782, bsz=16427.4, num_updates=47100, lr=0.00029142, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=0 epoch 028: 1150 / 1707 loss=4.034, nll_loss=2.397, ppl=5.27, wps=273034, ups=0.64, wpb=429782, bsz=16427.4, num_updates=47100, lr=0.00029142, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=0 epoch 028: 1150 / 1707 loss=4.034, nll_loss=2.397, ppl=5.27, wps=273034, ups=0.64, wpb=429782, bsz=16427.4, num_updates=47100, lr=0.00029142, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=0 epoch 028: 1150 / 1707 loss=4.034, nll_loss=2.397, ppl=5.27, wps=273034, ups=0.64, wpb=429782, bsz=16427.4, num_updates=47100, lr=0.00029142, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=0 epoch 028: 1150 / 1707 loss=4.034, nll_loss=2.397, ppl=5.27, wps=273034, ups=0.64, wpb=429782, bsz=16427.4, num_updates=47100, lr=0.00029142, gnorm=0.23, clip=0, loss_scale=2, train_wall=145, gb_free=58.8, wall=0 epoch 028: 1250 / 1707 loss=4.032, nll_loss=2.396, ppl=5.26, wps=298241, ups=0.69, wpb=430575, bsz=16276.5, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 1250 / 1707 loss=4.032, nll_loss=2.396, ppl=5.26, wps=298241, ups=0.69, wpb=430575, bsz=16276.5, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 1250 / 1707 loss=4.032, nll_loss=2.396, ppl=5.26, wps=298241, ups=0.69, wpb=430575, bsz=16276.5, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 1250 / 1707 loss=4.032, nll_loss=2.396, ppl=5.26, wps=298241, ups=0.69, wpb=430575, bsz=16276.5, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 1250 / 1707 loss=4.032, nll_loss=2.396, ppl=5.26, wps=298241, ups=0.69, wpb=430575, bsz=16276.5, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 1250 / 1707 loss=4.032, nll_loss=2.396, ppl=5.26, wps=298241, ups=0.69, wpb=430575, bsz=16276.5, num_updates=47200, lr=0.000291111, gnorm=0.248, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 028: 1351 / 1707 loss=4.033, nll_loss=2.397, ppl=5.27, wps=295044, ups=0.69, wpb=429631, bsz=16200, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=0 epoch 028: 1351 / 1707 loss=4.033, nll_loss=2.397, ppl=5.27, wps=295044, ups=0.69, wpb=429631, bsz=16200, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=0 epoch 028: 1351 / 1707 loss=4.033, nll_loss=2.397, ppl=5.27, wps=295044, ups=0.69, wpb=429631, bsz=16200, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=0 epoch 028: 1351 / 1707 loss=4.033, nll_loss=2.397, ppl=5.27, wps=295044, ups=0.69, wpb=429631, bsz=16200, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=0 epoch 028: 1351 / 1707 loss=4.033, nll_loss=2.397, ppl=5.27, wps=295044, ups=0.69, wpb=429631, bsz=16200, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=0 epoch 028: 1351 / 1707 loss=4.033, nll_loss=2.397, ppl=5.27, wps=295044, ups=0.69, wpb=429631, bsz=16200, num_updates=47300, lr=0.000290803, gnorm=0.228, clip=0, loss_scale=2, train_wall=145, gb_free=59.1, wall=0 epoch 028: 1452 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=294360, ups=0.69, wpb=428580, bsz=16134.3, num_updates=47400, lr=0.000290496, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 028: 1452 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=294360, ups=0.69, wpb=428580, bsz=16134.3, num_updates=47400, lr=0.000290496, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 028: 1452 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=294360, ups=0.69, wpb=428580, bsz=16134.3, num_updates=47400, lr=0.000290496, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 028: 1452 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=294360, ups=0.69, wpb=428580, bsz=16134.3, num_updates=47400, lr=0.000290496, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 028: 1452 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=294360, ups=0.69, wpb=428580, bsz=16134.3, num_updates=47400, lr=0.000290496, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 028: 1452 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=294360, ups=0.69, wpb=428580, bsz=16134.3, num_updates=47400, lr=0.000290496, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 028: 1552 / 1707 loss=4.037, nll_loss=2.402, ppl=5.28, wps=297819, ups=0.69, wpb=429328, bsz=16375, num_updates=47500, lr=0.000290191, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 028: 1552 / 1707 loss=4.037, nll_loss=2.402, ppl=5.28, wps=297819, ups=0.69, wpb=429328, bsz=16375, num_updates=47500, lr=0.000290191, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 028: 1552 / 1707 loss=4.037, nll_loss=2.402, ppl=5.28, wps=297819, ups=0.69, wpb=429328, bsz=16375, num_updates=47500, lr=0.000290191, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 028: 1552 / 1707 loss=4.037, nll_loss=2.402, ppl=5.28, wps=297819, ups=0.69, wpb=429328, bsz=16375, num_updates=47500, lr=0.000290191, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 028: 1552 / 1707 loss=4.037, nll_loss=2.402, ppl=5.28, wps=297819, ups=0.69, wpb=429328, bsz=16375, num_updates=47500, lr=0.000290191, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 028: 1552 / 1707 loss=4.037, nll_loss=2.402, ppl=5.28, wps=297819, ups=0.69, wpb=429328, bsz=16375, num_updates=47500, lr=0.000290191, gnorm=0.242, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 028: 1652 / 1707 loss=4.035, nll_loss=2.399, ppl=5.27, wps=297280, ups=0.69, wpb=429241, bsz=16326.3, num_updates=47600, lr=0.000289886, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 028: 1652 / 1707 loss=4.035, nll_loss=2.399, ppl=5.27, wps=297280, ups=0.69, wpb=429241, bsz=16326.3, num_updates=47600, lr=0.000289886, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 028: 1652 / 1707 loss=4.035, nll_loss=2.399, ppl=5.27, wps=297280, ups=0.69, wpb=429241, bsz=16326.3, num_updates=47600, lr=0.000289886, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 028: 1652 / 1707 loss=4.035, nll_loss=2.399, ppl=5.27, wps=297280, ups=0.69, wpb=429241, bsz=16326.3, num_updates=47600, lr=0.000289886, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 028: 1652 / 1707 loss=4.035, nll_loss=2.399, ppl=5.27, wps=297280, ups=0.69, wpb=429241, bsz=16326.3, num_updates=47600, lr=0.000289886, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 028: 1652 / 1707 loss=4.035, nll_loss=2.399, ppl=5.27, wps=297280, ups=0.69, wpb=429241, bsz=16326.3, num_updates=47600, lr=0.000289886, gnorm=0.225, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 end of epoch 28 (average epoch stats below) epoch 028 | loss 4.024 | nll_loss 2.386 | ppl 5.23 | wps 293419 | ups 0.68 | wpb 428931 | bsz 16335.3 | num_updates 47655 | lr 0.000289718 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2449 | gb_free 59.3 | wall 0 epoch 028 | loss 4.024 | nll_loss 2.386 | ppl 5.23 | wps 293419 | ups 0.68 | wpb 428931 | bsz 16335.3 | num_updates 47655 | lr 0.000289718 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2449 | gb_free 59.3 | wall 0 epoch 028 | loss 4.024 | nll_loss 2.386 | ppl 5.23 | wps 293419 | ups 0.68 | wpb 428931 | bsz 16335.3 | num_updates 47655 | lr 0.000289718 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2449 | gb_free 59.3 | wall 0 epoch 028 | loss 4.024 | nll_loss 2.386 | ppl 5.23 | wps 293419 | ups 0.68 | wpb 428931 | bsz 16335.3 | num_updates 47655 | lr 0.000289718 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2449 | gb_free 59.3 | wall 0 epoch 028 | loss 4.024 | nll_loss 2.386 | ppl 5.23 | wps 293419 | ups 0.68 | wpb 428931 | bsz 16335.3 | num_updates 47655 | lr 0.000289718 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2449 | gb_free 59.3 | wall 0 epoch 028 | loss 4.024 | nll_loss 2.386 | ppl 5.23 | wps 293419 | ups 0.68 | wpb 428931 | bsz 16335.3 | num_updates 47655 | lr 0.000289718 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2449 | gb_free 59.3 | wall 0 Start iterating over samples epoch 029: 45 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=295988, ups=0.7, wpb=423440, bsz=16143.9, num_updates=47700, lr=0.000289581, gnorm=0.236, clip=0, loss_scale=2, train_wall=142, gb_free=59.3, wall=0 epoch 029: 45 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=295988, ups=0.7, wpb=423440, bsz=16143.9, num_updates=47700, lr=0.000289581, gnorm=0.236, clip=0, loss_scale=2, train_wall=142, gb_free=59.3, wall=0 epoch 029: 45 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=295988, ups=0.7, wpb=423440, bsz=16143.9, num_updates=47700, lr=0.000289581, gnorm=0.236, clip=0, loss_scale=2, train_wall=142, gb_free=59.3, wall=0 epoch 029: 45 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=295988, ups=0.7, wpb=423440, bsz=16143.9, num_updates=47700, lr=0.000289581, gnorm=0.236, clip=0, loss_scale=2, train_wall=142, gb_free=59.3, wall=0 epoch 029: 45 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=295988, ups=0.7, wpb=423440, bsz=16143.9, num_updates=47700, lr=0.000289581, gnorm=0.236, clip=0, loss_scale=2, train_wall=142, gb_free=59.3, wall=0 epoch 029: 45 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=295988, ups=0.7, wpb=423440, bsz=16143.9, num_updates=47700, lr=0.000289581, gnorm=0.236, clip=0, loss_scale=2, train_wall=142, gb_free=59.3, wall=0 epoch 029: 45 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=295988, ups=0.7, wpb=423440, bsz=16143.9, num_updates=47700, lr=0.000289581, gnorm=0.236, clip=0, loss_scale=2, train_wall=142, gb_free=59.3, wall=0 epoch 029: 146 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=295108, ups=0.69, wpb=430614, bsz=16384.6, num_updates=47800, lr=0.000289278, gnorm=0.224, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 029: 146 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=295108, ups=0.69, wpb=430614, bsz=16384.6, num_updates=47800, lr=0.000289278, gnorm=0.224, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 029: 146 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=295108, ups=0.69, wpb=430614, bsz=16384.6, num_updates=47800, lr=0.000289278, gnorm=0.224, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 029: 146 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=295108, ups=0.69, wpb=430614, bsz=16384.6, num_updates=47800, lr=0.000289278, gnorm=0.224, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 029: 146 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=295108, ups=0.69, wpb=430614, bsz=16384.6, num_updates=47800, lr=0.000289278, gnorm=0.224, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 029: 146 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=295108, ups=0.69, wpb=430614, bsz=16384.6, num_updates=47800, lr=0.000289278, gnorm=0.224, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 029: 146 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=295108, ups=0.69, wpb=430614, bsz=16384.6, num_updates=47800, lr=0.000289278, gnorm=0.224, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 029: 246 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=297960, ups=0.69, wpb=429316, bsz=16327.2, num_updates=47900, lr=0.000288976, gnorm=0.229, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 246 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=297960, ups=0.69, wpb=429316, bsz=16327.2, num_updates=47900, lr=0.000288976, gnorm=0.229, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 246 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=297960, ups=0.69, wpb=429316, bsz=16327.2, num_updates=47900, lr=0.000288976, gnorm=0.229, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 246 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=297960, ups=0.69, wpb=429316, bsz=16327.2, num_updates=47900, lr=0.000288976, gnorm=0.229, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 246 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=297960, ups=0.69, wpb=429316, bsz=16327.2, num_updates=47900, lr=0.000288976, gnorm=0.229, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 246 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=297960, ups=0.69, wpb=429316, bsz=16327.2, num_updates=47900, lr=0.000288976, gnorm=0.229, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 246 / 1707 loss=4.005, nll_loss=2.364, ppl=5.15, wps=297960, ups=0.69, wpb=429316, bsz=16327.2, num_updates=47900, lr=0.000288976, gnorm=0.229, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 346 / 1707 loss=4.004, nll_loss=2.363, ppl=5.14, wps=297521, ups=0.69, wpb=429592, bsz=16184.5, num_updates=48000, lr=0.000288675, gnorm=0.247, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 029: 346 / 1707 loss=4.004, nll_loss=2.363, ppl=5.14, wps=297521, ups=0.69, wpb=429592, bsz=16184.5, num_updates=48000, lr=0.000288675, gnorm=0.247, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 029: 346 / 1707 loss=4.004, nll_loss=2.363, ppl=5.14, wps=297521, ups=0.69, wpb=429592, bsz=16184.5, num_updates=48000, lr=0.000288675, gnorm=0.247, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 029: 346 / 1707 loss=4.004, nll_loss=2.363, ppl=5.14, wps=297521, ups=0.69, wpb=429592, bsz=16184.5, num_updates=48000, lr=0.000288675, gnorm=0.247, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 029: 346 / 1707 loss=4.004, nll_loss=2.363, ppl=5.14, wps=297521, ups=0.69, wpb=429592, bsz=16184.5, num_updates=48000, lr=0.000288675, gnorm=0.247, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 029: 346 / 1707 loss=4.004, nll_loss=2.363, ppl=5.14, wps=297521, ups=0.69, wpb=429592, bsz=16184.5, num_updates=48000, lr=0.000288675, gnorm=0.247, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 029: 346 / 1707 loss=4.004, nll_loss=2.363, ppl=5.14, wps=297521, ups=0.69, wpb=429592, bsz=16184.5, num_updates=48000, lr=0.000288675, gnorm=0.247, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 4.287 | nll_loss 2.662 | ppl 6.33 | wps 76373.6 | wpb 21331 | bsz 1016 | num_updates 48000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.287 | nll_loss 2.662 | ppl 6.33 | wps 76373.6 | wpb 21331 | bsz 1016 | num_updates 48000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.287 | nll_loss 2.662 | ppl 6.33 | wps 76373.6 | wpb 21331 | bsz 1016 | num_updates 48000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.287 | nll_loss 2.662 | ppl 6.33 | wps 76373.6 | wpb 21331 | bsz 1016 | num_updates 48000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.287 | nll_loss 2.662 | ppl 6.33 | wps 76373.6 | wpb 21331 | bsz 1016 | num_updates 48000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.287 | nll_loss 2.662 | ppl 6.33 | wps 76373.6 | wpb 21331 | bsz 1016 | num_updates 48000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.287 | nll_loss 2.662 | ppl 6.33 | wps 76373.6 | wpb 21331 | bsz 1016 | num_updates 48000 | best_loss 4.28 epoch 029: 446 / 1707 loss=4.008, nll_loss=2.368, ppl=5.16, wps=266066, ups=0.62, wpb=428833, bsz=16582.8, num_updates=48100, lr=0.000288375, gnorm=0.229, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 029: 446 / 1707 loss=4.008, nll_loss=2.368, ppl=5.16, wps=266066, ups=0.62, wpb=428833, bsz=16582.8, num_updates=48100, lr=0.000288375, gnorm=0.229, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 029: 446 / 1707 loss=4.008, nll_loss=2.368, ppl=5.16, wps=266066, ups=0.62, wpb=428833, bsz=16582.8, num_updates=48100, lr=0.000288375, gnorm=0.229, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 029: 446 / 1707 loss=4.008, nll_loss=2.368, ppl=5.16, wps=266066, ups=0.62, wpb=428833, bsz=16582.8, num_updates=48100, lr=0.000288375, gnorm=0.229, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 029: 446 / 1707 loss=4.008, nll_loss=2.368, ppl=5.16, wps=266066, ups=0.62, wpb=428833, bsz=16582.8, num_updates=48100, lr=0.000288375, gnorm=0.229, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 029: 446 / 1707 loss=4.008, nll_loss=2.368, ppl=5.16, wps=266066, ups=0.62, wpb=428833, bsz=16582.8, num_updates=48100, lr=0.000288375, gnorm=0.229, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 029: 446 / 1707 loss=4.008, nll_loss=2.368, ppl=5.16, wps=266066, ups=0.62, wpb=428833, bsz=16582.8, num_updates=48100, lr=0.000288375, gnorm=0.229, clip=0, loss_scale=2, train_wall=143, gb_free=59.1, wall=0 epoch 029: 546 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297248, ups=0.69, wpb=428090, bsz=16336.9, num_updates=48200, lr=0.000288076, gnorm=0.239, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 546 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297248, ups=0.69, wpb=428090, bsz=16336.9, num_updates=48200, lr=0.000288076, gnorm=0.239, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 546 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297248, ups=0.69, wpb=428090, bsz=16336.9, num_updates=48200, lr=0.000288076, gnorm=0.239, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 546 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297248, ups=0.69, wpb=428090, bsz=16336.9, num_updates=48200, lr=0.000288076, gnorm=0.239, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 546 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297248, ups=0.69, wpb=428090, bsz=16336.9, num_updates=48200, lr=0.000288076, gnorm=0.239, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 546 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297248, ups=0.69, wpb=428090, bsz=16336.9, num_updates=48200, lr=0.000288076, gnorm=0.239, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 546 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297248, ups=0.69, wpb=428090, bsz=16336.9, num_updates=48200, lr=0.000288076, gnorm=0.239, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 647 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294375, ups=0.68, wpb=429800, bsz=16355, num_updates=48300, lr=0.000287777, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 029: 647 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294375, ups=0.68, wpb=429800, bsz=16355, num_updates=48300, lr=0.000287777, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 029: 647 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294375, ups=0.68, wpb=429800, bsz=16355, num_updates=48300, lr=0.000287777, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 029: 647 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294375, ups=0.68, wpb=429800, bsz=16355, num_updates=48300, lr=0.000287777, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 029: 647 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294375, ups=0.68, wpb=429800, bsz=16355, num_updates=48300, lr=0.000287777, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 029: 647 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294375, ups=0.68, wpb=429800, bsz=16355, num_updates=48300, lr=0.000287777, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 029: 647 / 1707 loss=4.017, nll_loss=2.378, ppl=5.2, wps=294375, ups=0.68, wpb=429800, bsz=16355, num_updates=48300, lr=0.000287777, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.1, wall=0 epoch 029: 747 / 1707 loss=4.023, nll_loss=2.385, ppl=5.22, wps=297459, ups=0.69, wpb=429206, bsz=16323.7, num_updates=48400, lr=0.00028748, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 029: 747 / 1707 loss=4.023, nll_loss=2.385, ppl=5.22, wps=297459, ups=0.69, wpb=429206, bsz=16323.7, num_updates=48400, lr=0.00028748, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 029: 747 / 1707 loss=4.023, nll_loss=2.385, ppl=5.22, wps=297459, ups=0.69, wpb=429206, bsz=16323.7, num_updates=48400, lr=0.00028748, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 029: 747 / 1707 loss=4.023, nll_loss=2.385, ppl=5.22, wps=297459, ups=0.69, wpb=429206, bsz=16323.7, num_updates=48400, lr=0.00028748, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 029: 747 / 1707 loss=4.023, nll_loss=2.385, ppl=5.22, wps=297459, ups=0.69, wpb=429206, bsz=16323.7, num_updates=48400, lr=0.00028748, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 029: 747 / 1707 loss=4.023, nll_loss=2.385, ppl=5.22, wps=297459, ups=0.69, wpb=429206, bsz=16323.7, num_updates=48400, lr=0.00028748, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 029: 747 / 1707 loss=4.023, nll_loss=2.385, ppl=5.22, wps=297459, ups=0.69, wpb=429206, bsz=16323.7, num_updates=48400, lr=0.00028748, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 029: 847 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=295654, ups=0.69, wpb=427602, bsz=16373.3, num_updates=48500, lr=0.000287183, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 029: 847 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=295654, ups=0.69, wpb=427602, bsz=16373.3, num_updates=48500, lr=0.000287183, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 029: 847 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=295654, ups=0.69, wpb=427602, bsz=16373.3, num_updates=48500, lr=0.000287183, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 029: 847 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=295654, ups=0.69, wpb=427602, bsz=16373.3, num_updates=48500, lr=0.000287183, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 029: 847 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=295654, ups=0.69, wpb=427602, bsz=16373.3, num_updates=48500, lr=0.000287183, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 029: 847 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=295654, ups=0.69, wpb=427602, bsz=16373.3, num_updates=48500, lr=0.000287183, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 029: 847 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=295654, ups=0.69, wpb=427602, bsz=16373.3, num_updates=48500, lr=0.000287183, gnorm=0.228, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 029: 948 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=293777, ups=0.69, wpb=428726, bsz=16435, num_updates=48600, lr=0.000286888, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=0 epoch 029: 948 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=293777, ups=0.69, wpb=428726, bsz=16435, num_updates=48600, lr=0.000286888, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=0 epoch 029: 948 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=293777, ups=0.69, wpb=428726, bsz=16435, num_updates=48600, lr=0.000286888, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=0 epoch 029: 948 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=293777, ups=0.69, wpb=428726, bsz=16435, num_updates=48600, lr=0.000286888, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=0 epoch 029: 948 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=293777, ups=0.69, wpb=428726, bsz=16435, num_updates=48600, lr=0.000286888, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=0 epoch 029: 948 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=293777, ups=0.69, wpb=428726, bsz=16435, num_updates=48600, lr=0.000286888, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=0 epoch 029: 948 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=293777, ups=0.69, wpb=428726, bsz=16435, num_updates=48600, lr=0.000286888, gnorm=0.23, clip=0, loss_scale=1, train_wall=145, gb_free=58.8, wall=0 epoch 029: 1048 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296800, ups=0.69, wpb=429210, bsz=16470.5, num_updates=48700, lr=0.000286593, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 029: 1048 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296800, ups=0.69, wpb=429210, bsz=16470.5, num_updates=48700, lr=0.000286593, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 029: 1048 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296800, ups=0.69, wpb=429210, bsz=16470.5, num_updates=48700, lr=0.000286593, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 029: 1048 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296800, ups=0.69, wpb=429210, bsz=16470.5, num_updates=48700, lr=0.000286593, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 029: 1048 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296800, ups=0.69, wpb=429210, bsz=16470.5, num_updates=48700, lr=0.000286593, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 029: 1048 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296800, ups=0.69, wpb=429210, bsz=16470.5, num_updates=48700, lr=0.000286593, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 029: 1048 / 1707 loss=4.022, nll_loss=2.384, ppl=5.22, wps=296800, ups=0.69, wpb=429210, bsz=16470.5, num_updates=48700, lr=0.000286593, gnorm=0.226, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 029: 1148 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297025, ups=0.69, wpb=428405, bsz=16268, num_updates=48800, lr=0.000286299, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1148 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297025, ups=0.69, wpb=428405, bsz=16268, num_updates=48800, lr=0.000286299, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1148 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297025, ups=0.69, wpb=428405, bsz=16268, num_updates=48800, lr=0.000286299, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1148 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297025, ups=0.69, wpb=428405, bsz=16268, num_updates=48800, lr=0.000286299, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1148 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297025, ups=0.69, wpb=428405, bsz=16268, num_updates=48800, lr=0.000286299, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1148 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297025, ups=0.69, wpb=428405, bsz=16268, num_updates=48800, lr=0.000286299, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1148 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297025, ups=0.69, wpb=428405, bsz=16268, num_updates=48800, lr=0.000286299, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1248 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=298574, ups=0.69, wpb=429618, bsz=16348.6, num_updates=48900, lr=0.000286006, gnorm=0.245, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1248 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=298574, ups=0.69, wpb=429618, bsz=16348.6, num_updates=48900, lr=0.000286006, gnorm=0.245, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1248 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=298574, ups=0.69, wpb=429618, bsz=16348.6, num_updates=48900, lr=0.000286006, gnorm=0.245, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1248 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=298574, ups=0.69, wpb=429618, bsz=16348.6, num_updates=48900, lr=0.000286006, gnorm=0.245, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1248 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=298574, ups=0.69, wpb=429618, bsz=16348.6, num_updates=48900, lr=0.000286006, gnorm=0.245, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1248 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=298574, ups=0.69, wpb=429618, bsz=16348.6, num_updates=48900, lr=0.000286006, gnorm=0.245, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1248 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=298574, ups=0.69, wpb=429618, bsz=16348.6, num_updates=48900, lr=0.000286006, gnorm=0.245, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 029: 1348 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=297241, ups=0.69, wpb=429596, bsz=16434.7, num_updates=49000, lr=0.000285714, gnorm=0.227, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 029: 1348 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=297241, ups=0.69, wpb=429596, bsz=16434.7, num_updates=49000, lr=0.000285714, gnorm=0.227, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 029: 1348 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=297241, ups=0.69, wpb=429596, bsz=16434.7, num_updates=49000, lr=0.000285714, gnorm=0.227, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 029: 1348 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=297241, ups=0.69, wpb=429596, bsz=16434.7, num_updates=49000, lr=0.000285714, gnorm=0.227, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 029: 1348 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=297241, ups=0.69, wpb=429596, bsz=16434.7, num_updates=49000, lr=0.000285714, gnorm=0.227, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 029: 1348 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=297241, ups=0.69, wpb=429596, bsz=16434.7, num_updates=49000, lr=0.000285714, gnorm=0.227, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 029: 1348 / 1707 loss=4.024, nll_loss=2.387, ppl=5.23, wps=297241, ups=0.69, wpb=429596, bsz=16434.7, num_updates=49000, lr=0.000285714, gnorm=0.227, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 4.295 | nll_loss 2.672 | ppl 6.38 | wps 76073.2 | wpb 21331 | bsz 1016 | num_updates 49000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.295 | nll_loss 2.672 | ppl 6.38 | wps 76073.2 | wpb 21331 | bsz 1016 | num_updates 49000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.295 | nll_loss 2.672 | ppl 6.38 | wps 76073.2 | wpb 21331 | bsz 1016 | num_updates 49000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.295 | nll_loss 2.672 | ppl 6.38 | wps 76073.2 | wpb 21331 | bsz 1016 | num_updates 49000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.295 | nll_loss 2.672 | ppl 6.38 | wps 76073.2 | wpb 21331 | bsz 1016 | num_updates 49000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.295 | nll_loss 2.672 | ppl 6.38 | wps 76073.2 | wpb 21331 | bsz 1016 | num_updates 49000 | best_loss 4.28 epoch 029 | valid on 'valid' subset | loss 4.295 | nll_loss 2.672 | ppl 6.38 | wps 76073.2 | wpb 21331 | bsz 1016 | num_updates 49000 | best_loss 4.28 epoch 029: 1449 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=271135, ups=0.63, wpb=427935, bsz=16342.9, num_updates=49100, lr=0.000285423, gnorm=0.22, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 029: 1449 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=271135, ups=0.63, wpb=427935, bsz=16342.9, num_updates=49100, lr=0.000285423, gnorm=0.22, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 029: 1449 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=271135, ups=0.63, wpb=427935, bsz=16342.9, num_updates=49100, lr=0.000285423, gnorm=0.22, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 029: 1449 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=271135, ups=0.63, wpb=427935, bsz=16342.9, num_updates=49100, lr=0.000285423, gnorm=0.22, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 029: 1449 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=271135, ups=0.63, wpb=427935, bsz=16342.9, num_updates=49100, lr=0.000285423, gnorm=0.22, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 029: 1449 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=271135, ups=0.63, wpb=427935, bsz=16342.9, num_updates=49100, lr=0.000285423, gnorm=0.22, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 029: 1449 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=271135, ups=0.63, wpb=427935, bsz=16342.9, num_updates=49100, lr=0.000285423, gnorm=0.22, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 029: 1549 / 1707 loss=4.026, nll_loss=2.39, ppl=5.24, wps=297516, ups=0.69, wpb=429060, bsz=16157.2, num_updates=49200, lr=0.000285133, gnorm=0.249, clip=0, loss_scale=1, train_wall=143, gb_free=59, wall=0 epoch 029: 1549 / 1707 loss=4.026, nll_loss=2.39, ppl=5.24, wps=297516, ups=0.69, wpb=429060, bsz=16157.2, num_updates=49200, lr=0.000285133, gnorm=0.249, clip=0, loss_scale=1, train_wall=143, gb_free=59, wall=0 epoch 029: 1549 / 1707 loss=4.026, nll_loss=2.39, ppl=5.24, wps=297516, ups=0.69, wpb=429060, bsz=16157.2, num_updates=49200, lr=0.000285133, gnorm=0.249, clip=0, loss_scale=1, train_wall=143, gb_free=59, wall=0 epoch 029: 1549 / 1707 loss=4.026, nll_loss=2.39, ppl=5.24, wps=297516, ups=0.69, wpb=429060, bsz=16157.2, num_updates=49200, lr=0.000285133, gnorm=0.249, clip=0, loss_scale=1, train_wall=143, gb_free=59, wall=0 epoch 029: 1549 / 1707 loss=4.026, nll_loss=2.39, ppl=5.24, wps=297516, ups=0.69, wpb=429060, bsz=16157.2, num_updates=49200, lr=0.000285133, gnorm=0.249, clip=0, loss_scale=1, train_wall=143, gb_free=59, wall=0 epoch 029: 1549 / 1707 loss=4.026, nll_loss=2.39, ppl=5.24, wps=297516, ups=0.69, wpb=429060, bsz=16157.2, num_updates=49200, lr=0.000285133, gnorm=0.249, clip=0, loss_scale=1, train_wall=143, gb_free=59, wall=0 epoch 029: 1549 / 1707 loss=4.026, nll_loss=2.39, ppl=5.24, wps=297516, ups=0.69, wpb=429060, bsz=16157.2, num_updates=49200, lr=0.000285133, gnorm=0.249, clip=0, loss_scale=1, train_wall=143, gb_free=59, wall=0 epoch 029: 1649 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=297509, ups=0.69, wpb=430465, bsz=16373.3, num_updates=49300, lr=0.000284844, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=0 epoch 029: 1649 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=297509, ups=0.69, wpb=430465, bsz=16373.3, num_updates=49300, lr=0.000284844, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=0 epoch 029: 1649 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=297509, ups=0.69, wpb=430465, bsz=16373.3, num_updates=49300, lr=0.000284844, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=0 epoch 029: 1649 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=297509, ups=0.69, wpb=430465, bsz=16373.3, num_updates=49300, lr=0.000284844, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=0 epoch 029: 1649 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=297509, ups=0.69, wpb=430465, bsz=16373.3, num_updates=49300, lr=0.000284844, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=0 epoch 029: 1649 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=297509, ups=0.69, wpb=430465, bsz=16373.3, num_updates=49300, lr=0.000284844, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=0 epoch 029: 1649 / 1707 loss=4.029, nll_loss=2.392, ppl=5.25, wps=297509, ups=0.69, wpb=430465, bsz=16373.3, num_updates=49300, lr=0.000284844, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=59.6, wall=0 end of epoch 29 (average epoch stats below) epoch 029 | loss 4.018 | nll_loss 2.379 | ppl 5.2 | wps 293151 | ups 0.68 | wpb 428934 | bsz 16333.2 | num_updates 49358 | lr 0.000284676 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 2448 | gb_free 59.4 | wall 0 epoch 029 | loss 4.018 | nll_loss 2.379 | ppl 5.2 | wps 293151 | ups 0.68 | wpb 428934 | bsz 16333.2 | num_updates 49358 | lr 0.000284676 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 2448 | gb_free 59.4 | wall 0 epoch 029 | loss 4.018 | nll_loss 2.379 | ppl 5.2 | wps 293151 | ups 0.68 | wpb 428934 | bsz 16333.2 | num_updates 49358 | lr 0.000284676 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 2448 | gb_free 59.4 | wall 0 epoch 029 | loss 4.018 | nll_loss 2.379 | ppl 5.2 | wps 293151 | ups 0.68 | wpb 428934 | bsz 16333.2 | num_updates 49358 | lr 0.000284676 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 2448 | gb_free 59.4 | wall 0 epoch 029 | loss 4.018 | nll_loss 2.379 | ppl 5.2 | wps 293151 | ups 0.68 | wpb 428934 | bsz 16333.2 | num_updates 49358 | lr 0.000284676 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 2448 | gb_free 59.4 | wall 0 epoch 029 | loss 4.018 | nll_loss 2.379 | ppl 5.2 | wps 293151 | ups 0.68 | wpb 428934 | bsz 16333.2 | num_updates 49358 | lr 0.000284676 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 2448 | gb_free 59.4 | wall 0 epoch 029 | loss 4.018 | nll_loss 2.379 | ppl 5.2 | wps 293151 | ups 0.68 | wpb 428934 | bsz 16333.2 | num_updates 49358 | lr 0.000284676 | gnorm 0.233 | clip 0 | loss_scale 2 | train_wall 2448 | gb_free 59.4 | wall 0 Start iterating over samples epoch 030: 42 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=297310, ups=0.7, wpb=424970, bsz=15967, num_updates=49400, lr=0.000284555, gnorm=0.228, clip=0, loss_scale=2, train_wall=142, gb_free=59.2, wall=0 epoch 030: 42 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=297310, ups=0.7, wpb=424970, bsz=15967, num_updates=49400, lr=0.000284555, gnorm=0.228, clip=0, loss_scale=2, train_wall=142, gb_free=59.2, wall=0 epoch 030: 42 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=297310, ups=0.7, wpb=424970, bsz=15967, num_updates=49400, lr=0.000284555, gnorm=0.228, clip=0, loss_scale=2, train_wall=142, gb_free=59.2, wall=0 epoch 030: 42 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=297310, ups=0.7, wpb=424970, bsz=15967, num_updates=49400, lr=0.000284555, gnorm=0.228, clip=0, loss_scale=2, train_wall=142, gb_free=59.2, wall=0 epoch 030: 42 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=297310, ups=0.7, wpb=424970, bsz=15967, num_updates=49400, lr=0.000284555, gnorm=0.228, clip=0, loss_scale=2, train_wall=142, gb_free=59.2, wall=0 epoch 030: 42 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=297310, ups=0.7, wpb=424970, bsz=15967, num_updates=49400, lr=0.000284555, gnorm=0.228, clip=0, loss_scale=2, train_wall=142, gb_free=59.2, wall=0 epoch 030: 42 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=297310, ups=0.7, wpb=424970, bsz=15967, num_updates=49400, lr=0.000284555, gnorm=0.228, clip=0, loss_scale=2, train_wall=142, gb_free=59.2, wall=0 epoch 030: 42 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=297310, ups=0.7, wpb=424970, bsz=15967, num_updates=49400, lr=0.000284555, gnorm=0.228, clip=0, loss_scale=2, train_wall=142, gb_free=59.2, wall=0 epoch 030: 142 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297734, ups=0.69, wpb=429966, bsz=16263.6, num_updates=49500, lr=0.000284268, gnorm=0.238, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 030: 142 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297734, ups=0.69, wpb=429966, bsz=16263.6, num_updates=49500, lr=0.000284268, gnorm=0.238, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 030: 142 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297734, ups=0.69, wpb=429966, bsz=16263.6, num_updates=49500, lr=0.000284268, gnorm=0.238, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 030: 142 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297734, ups=0.69, wpb=429966, bsz=16263.6, num_updates=49500, lr=0.000284268, gnorm=0.238, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 030: 142 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297734, ups=0.69, wpb=429966, bsz=16263.6, num_updates=49500, lr=0.000284268, gnorm=0.238, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 030: 142 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297734, ups=0.69, wpb=429966, bsz=16263.6, num_updates=49500, lr=0.000284268, gnorm=0.238, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 030: 142 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297734, ups=0.69, wpb=429966, bsz=16263.6, num_updates=49500, lr=0.000284268, gnorm=0.238, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 030: 142 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297734, ups=0.69, wpb=429966, bsz=16263.6, num_updates=49500, lr=0.000284268, gnorm=0.238, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 030: 242 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=296659, ups=0.69, wpb=429568, bsz=16501.2, num_updates=49600, lr=0.000283981, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 030: 242 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=296659, ups=0.69, wpb=429568, bsz=16501.2, num_updates=49600, lr=0.000283981, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 030: 242 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=296659, ups=0.69, wpb=429568, bsz=16501.2, num_updates=49600, lr=0.000283981, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 030: 242 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=296659, ups=0.69, wpb=429568, bsz=16501.2, num_updates=49600, lr=0.000283981, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 030: 242 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=296659, ups=0.69, wpb=429568, bsz=16501.2, num_updates=49600, lr=0.000283981, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 030: 242 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=296659, ups=0.69, wpb=429568, bsz=16501.2, num_updates=49600, lr=0.000283981, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 030: 242 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=296659, ups=0.69, wpb=429568, bsz=16501.2, num_updates=49600, lr=0.000283981, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 030: 242 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=296659, ups=0.69, wpb=429568, bsz=16501.2, num_updates=49600, lr=0.000283981, gnorm=0.237, clip=0, loss_scale=4, train_wall=144, gb_free=58.9, wall=0 epoch 030: 342 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=295800, ups=0.69, wpb=427394, bsz=16335.6, num_updates=49700, lr=0.000283695, gnorm=0.228, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=0 epoch 030: 342 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=295800, ups=0.69, wpb=427394, bsz=16335.6, num_updates=49700, lr=0.000283695, gnorm=0.228, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=0 epoch 030: 342 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=295800, ups=0.69, wpb=427394, bsz=16335.6, num_updates=49700, lr=0.000283695, gnorm=0.228, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=0 epoch 030: 342 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=295800, ups=0.69, wpb=427394, bsz=16335.6, num_updates=49700, lr=0.000283695, gnorm=0.228, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=0 epoch 030: 342 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=295800, ups=0.69, wpb=427394, bsz=16335.6, num_updates=49700, lr=0.000283695, gnorm=0.228, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=0 epoch 030: 342 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=295800, ups=0.69, wpb=427394, bsz=16335.6, num_updates=49700, lr=0.000283695, gnorm=0.228, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=0 epoch 030: 342 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=295800, ups=0.69, wpb=427394, bsz=16335.6, num_updates=49700, lr=0.000283695, gnorm=0.228, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=0 epoch 030: 342 / 1707 loss=4.005, nll_loss=2.365, ppl=5.15, wps=295800, ups=0.69, wpb=427394, bsz=16335.6, num_updates=49700, lr=0.000283695, gnorm=0.228, clip=0, loss_scale=4, train_wall=144, gb_free=59.3, wall=0 epoch 030: 443 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=293278, ups=0.68, wpb=428622, bsz=16457.9, num_updates=49800, lr=0.00028341, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=0 epoch 030: 443 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=293278, ups=0.68, wpb=428622, bsz=16457.9, num_updates=49800, lr=0.00028341, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=0 epoch 030: 443 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=293278, ups=0.68, wpb=428622, bsz=16457.9, num_updates=49800, lr=0.00028341, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=0 epoch 030: 443 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=293278, ups=0.68, wpb=428622, bsz=16457.9, num_updates=49800, lr=0.00028341, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=0 epoch 030: 443 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=293278, ups=0.68, wpb=428622, bsz=16457.9, num_updates=49800, lr=0.00028341, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=0 epoch 030: 443 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=293278, ups=0.68, wpb=428622, bsz=16457.9, num_updates=49800, lr=0.00028341, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=0 epoch 030: 443 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=293278, ups=0.68, wpb=428622, bsz=16457.9, num_updates=49800, lr=0.00028341, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=0 epoch 030: 443 / 1707 loss=4.002, nll_loss=2.361, ppl=5.14, wps=293278, ups=0.68, wpb=428622, bsz=16457.9, num_updates=49800, lr=0.00028341, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59, wall=0 epoch 030: 543 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=296560, ups=0.69, wpb=429509, bsz=16323.6, num_updates=49900, lr=0.000283126, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 030: 543 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=296560, ups=0.69, wpb=429509, bsz=16323.6, num_updates=49900, lr=0.000283126, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 030: 543 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=296560, ups=0.69, wpb=429509, bsz=16323.6, num_updates=49900, lr=0.000283126, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 030: 543 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=296560, ups=0.69, wpb=429509, bsz=16323.6, num_updates=49900, lr=0.000283126, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 030: 543 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=296560, ups=0.69, wpb=429509, bsz=16323.6, num_updates=49900, lr=0.000283126, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 030: 543 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=296560, ups=0.69, wpb=429509, bsz=16323.6, num_updates=49900, lr=0.000283126, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 030: 543 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=296560, ups=0.69, wpb=429509, bsz=16323.6, num_updates=49900, lr=0.000283126, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 030: 543 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=296560, ups=0.69, wpb=429509, bsz=16323.6, num_updates=49900, lr=0.000283126, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 030: 643 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=296786, ups=0.69, wpb=429246, bsz=16423.5, num_updates=50000, lr=0.000282843, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=0 epoch 030: 643 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=296786, ups=0.69, wpb=429246, bsz=16423.5, num_updates=50000, lr=0.000282843, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=0 epoch 030: 643 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=296786, ups=0.69, wpb=429246, bsz=16423.5, num_updates=50000, lr=0.000282843, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=0 epoch 030: 643 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=296786, ups=0.69, wpb=429246, bsz=16423.5, num_updates=50000, lr=0.000282843, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=0 epoch 030: 643 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=296786, ups=0.69, wpb=429246, bsz=16423.5, num_updates=50000, lr=0.000282843, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=0 epoch 030: 643 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=296786, ups=0.69, wpb=429246, bsz=16423.5, num_updates=50000, lr=0.000282843, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=0 epoch 030: 643 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=296786, ups=0.69, wpb=429246, bsz=16423.5, num_updates=50000, lr=0.000282843, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=0 epoch 030: 643 / 1707 loss=4.013, nll_loss=2.374, ppl=5.18, wps=296786, ups=0.69, wpb=429246, bsz=16423.5, num_updates=50000, lr=0.000282843, gnorm=0.243, clip=0, loss_scale=4, train_wall=144, gb_free=58.8, wall=0 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 4.297 | nll_loss 2.673 | ppl 6.38 | wps 75978 | wpb 21331 | bsz 1016 | num_updates 50000 | best_loss 4.28 epoch 030 | valid on 'valid' subset | loss 4.297 | nll_loss 2.673 | ppl 6.38 | wps 75978 | wpb 21331 | bsz 1016 | num_updates 50000 | best_loss 4.28 epoch 030 | valid on 'valid' subset | loss 4.297 | nll_loss 2.673 | ppl 6.38 | wps 75978 | wpb 21331 | bsz 1016 | num_updates 50000 | best_loss 4.28 epoch 030 | valid on 'valid' subset | loss 4.297 | nll_loss 2.673 | ppl 6.38 | wps 75978 | wpb 21331 | bsz 1016 | num_updates 50000 | best_loss 4.28 epoch 030 | valid on 'valid' subset | loss 4.297 | nll_loss 2.673 | ppl 6.38 | wps 75978 | wpb 21331 | bsz 1016 | num_updates 50000 | best_loss 4.28 epoch 030 | valid on 'valid' subset | loss 4.297 | nll_loss 2.673 | ppl 6.38 | wps 75978 | wpb 21331 | bsz 1016 | num_updates 50000 | best_loss 4.28 epoch 030 | valid on 'valid' subset | loss 4.297 | nll_loss 2.673 | ppl 6.38 | wps 75978 | wpb 21331 | bsz 1016 | num_updates 50000 | best_loss 4.28 epoch 030 | valid on 'valid' subset | loss 4.297 | nll_loss 2.673 | ppl 6.38 | wps 75978 | wpb 21331 | bsz 1016 | num_updates 50000 | best_loss 4.28 epoch 030: 743 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=275405, ups=0.64, wpb=430825, bsz=16496.5, num_updates=50100, lr=0.00028256, gnorm=0.232, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 030: 743 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=275405, ups=0.64, wpb=430825, bsz=16496.5, num_updates=50100, lr=0.00028256, gnorm=0.232, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 030: 743 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=275405, ups=0.64, wpb=430825, bsz=16496.5, num_updates=50100, lr=0.00028256, gnorm=0.232, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 030: 743 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=275405, ups=0.64, wpb=430825, bsz=16496.5, num_updates=50100, lr=0.00028256, gnorm=0.232, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 030: 743 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=275405, ups=0.64, wpb=430825, bsz=16496.5, num_updates=50100, lr=0.00028256, gnorm=0.232, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 030: 743 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=275405, ups=0.64, wpb=430825, bsz=16496.5, num_updates=50100, lr=0.00028256, gnorm=0.232, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 030: 743 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=275405, ups=0.64, wpb=430825, bsz=16496.5, num_updates=50100, lr=0.00028256, gnorm=0.232, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 030: 743 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=275405, ups=0.64, wpb=430825, bsz=16496.5, num_updates=50100, lr=0.00028256, gnorm=0.232, clip=0, loss_scale=4, train_wall=143, gb_free=59.2, wall=0 epoch 030: 844 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=293559, ups=0.68, wpb=429647, bsz=16182.7, num_updates=50200, lr=0.000282279, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 030: 844 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=293559, ups=0.68, wpb=429647, bsz=16182.7, num_updates=50200, lr=0.000282279, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 030: 844 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=293559, ups=0.68, wpb=429647, bsz=16182.7, num_updates=50200, lr=0.000282279, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 030: 844 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=293559, ups=0.68, wpb=429647, bsz=16182.7, num_updates=50200, lr=0.000282279, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 030: 844 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=293559, ups=0.68, wpb=429647, bsz=16182.7, num_updates=50200, lr=0.000282279, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 030: 844 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=293559, ups=0.68, wpb=429647, bsz=16182.7, num_updates=50200, lr=0.000282279, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 030: 844 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=293559, ups=0.68, wpb=429647, bsz=16182.7, num_updates=50200, lr=0.000282279, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 030: 844 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=293559, ups=0.68, wpb=429647, bsz=16182.7, num_updates=50200, lr=0.000282279, gnorm=0.225, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 030: 945 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=293274, ups=0.69, wpb=427930, bsz=16284.1, num_updates=50300, lr=0.000281998, gnorm=0.238, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 030: 945 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=293274, ups=0.69, wpb=427930, bsz=16284.1, num_updates=50300, lr=0.000281998, gnorm=0.238, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 030: 945 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=293274, ups=0.69, wpb=427930, bsz=16284.1, num_updates=50300, lr=0.000281998, gnorm=0.238, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 030: 945 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=293274, ups=0.69, wpb=427930, bsz=16284.1, num_updates=50300, lr=0.000281998, gnorm=0.238, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 030: 945 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=293274, ups=0.69, wpb=427930, bsz=16284.1, num_updates=50300, lr=0.000281998, gnorm=0.238, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 030: 945 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=293274, ups=0.69, wpb=427930, bsz=16284.1, num_updates=50300, lr=0.000281998, gnorm=0.238, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 030: 945 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=293274, ups=0.69, wpb=427930, bsz=16284.1, num_updates=50300, lr=0.000281998, gnorm=0.238, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 030: 945 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=293274, ups=0.69, wpb=427930, bsz=16284.1, num_updates=50300, lr=0.000281998, gnorm=0.238, clip=0, loss_scale=1, train_wall=145, gb_free=58.9, wall=0 epoch 030: 1045 / 1707 loss=4.016, nll_loss=2.377, ppl=5.2, wps=296154, ups=0.69, wpb=428844, bsz=16433.4, num_updates=50400, lr=0.000281718, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 030: 1045 / 1707 loss=4.016, nll_loss=2.377, ppl=5.2, wps=296154, ups=0.69, wpb=428844, bsz=16433.4, num_updates=50400, lr=0.000281718, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 030: 1045 / 1707 loss=4.016, nll_loss=2.377, ppl=5.2, wps=296154, ups=0.69, wpb=428844, bsz=16433.4, num_updates=50400, lr=0.000281718, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 030: 1045 / 1707 loss=4.016, nll_loss=2.377, ppl=5.2, wps=296154, ups=0.69, wpb=428844, bsz=16433.4, num_updates=50400, lr=0.000281718, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 030: 1045 / 1707 loss=4.016, nll_loss=2.377, ppl=5.2, wps=296154, ups=0.69, wpb=428844, bsz=16433.4, num_updates=50400, lr=0.000281718, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 030: 1045 / 1707 loss=4.016, nll_loss=2.377, ppl=5.2, wps=296154, ups=0.69, wpb=428844, bsz=16433.4, num_updates=50400, lr=0.000281718, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 030: 1045 / 1707 loss=4.016, nll_loss=2.377, ppl=5.2, wps=296154, ups=0.69, wpb=428844, bsz=16433.4, num_updates=50400, lr=0.000281718, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 030: 1045 / 1707 loss=4.016, nll_loss=2.377, ppl=5.2, wps=296154, ups=0.69, wpb=428844, bsz=16433.4, num_updates=50400, lr=0.000281718, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 030: 1145 / 1707 loss=4.008, nll_loss=2.369, ppl=5.16, wps=297673, ups=0.69, wpb=430199, bsz=16190.5, num_updates=50500, lr=0.000281439, gnorm=0.22, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1145 / 1707 loss=4.008, nll_loss=2.369, ppl=5.16, wps=297673, ups=0.69, wpb=430199, bsz=16190.5, num_updates=50500, lr=0.000281439, gnorm=0.22, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1145 / 1707 loss=4.008, nll_loss=2.369, ppl=5.16, wps=297673, ups=0.69, wpb=430199, bsz=16190.5, num_updates=50500, lr=0.000281439, gnorm=0.22, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1145 / 1707 loss=4.008, nll_loss=2.369, ppl=5.16, wps=297673, ups=0.69, wpb=430199, bsz=16190.5, num_updates=50500, lr=0.000281439, gnorm=0.22, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1145 / 1707 loss=4.008, nll_loss=2.369, ppl=5.16, wps=297673, ups=0.69, wpb=430199, bsz=16190.5, num_updates=50500, lr=0.000281439, gnorm=0.22, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1145 / 1707 loss=4.008, nll_loss=2.369, ppl=5.16, wps=297673, ups=0.69, wpb=430199, bsz=16190.5, num_updates=50500, lr=0.000281439, gnorm=0.22, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1145 / 1707 loss=4.008, nll_loss=2.369, ppl=5.16, wps=297673, ups=0.69, wpb=430199, bsz=16190.5, num_updates=50500, lr=0.000281439, gnorm=0.22, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1145 / 1707 loss=4.008, nll_loss=2.369, ppl=5.16, wps=297673, ups=0.69, wpb=430199, bsz=16190.5, num_updates=50500, lr=0.000281439, gnorm=0.22, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1245 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=296277, ups=0.69, wpb=428711, bsz=16258.1, num_updates=50600, lr=0.000281161, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1245 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=296277, ups=0.69, wpb=428711, bsz=16258.1, num_updates=50600, lr=0.000281161, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1245 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=296277, ups=0.69, wpb=428711, bsz=16258.1, num_updates=50600, lr=0.000281161, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1245 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=296277, ups=0.69, wpb=428711, bsz=16258.1, num_updates=50600, lr=0.000281161, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1245 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=296277, ups=0.69, wpb=428711, bsz=16258.1, num_updates=50600, lr=0.000281161, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1245 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=296277, ups=0.69, wpb=428711, bsz=16258.1, num_updates=50600, lr=0.000281161, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1245 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=296277, ups=0.69, wpb=428711, bsz=16258.1, num_updates=50600, lr=0.000281161, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1245 / 1707 loss=4.014, nll_loss=2.375, ppl=5.19, wps=296277, ups=0.69, wpb=428711, bsz=16258.1, num_updates=50600, lr=0.000281161, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59.1, wall=0 epoch 030: 1346 / 1707 loss=4.025, nll_loss=2.389, ppl=5.24, wps=294748, ups=0.68, wpb=430326, bsz=16470.5, num_updates=50700, lr=0.000280883, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 030: 1346 / 1707 loss=4.025, nll_loss=2.389, ppl=5.24, wps=294748, ups=0.68, wpb=430326, bsz=16470.5, num_updates=50700, lr=0.000280883, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 030: 1346 / 1707 loss=4.025, nll_loss=2.389, ppl=5.24, wps=294748, ups=0.68, wpb=430326, bsz=16470.5, num_updates=50700, lr=0.000280883, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 030: 1346 / 1707 loss=4.025, nll_loss=2.389, ppl=5.24, wps=294748, ups=0.68, wpb=430326, bsz=16470.5, num_updates=50700, lr=0.000280883, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 030: 1346 / 1707 loss=4.025, nll_loss=2.389, ppl=5.24, wps=294748, ups=0.68, wpb=430326, bsz=16470.5, num_updates=50700, lr=0.000280883, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 030: 1346 / 1707 loss=4.025, nll_loss=2.389, ppl=5.24, wps=294748, ups=0.68, wpb=430326, bsz=16470.5, num_updates=50700, lr=0.000280883, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 030: 1346 / 1707 loss=4.025, nll_loss=2.389, ppl=5.24, wps=294748, ups=0.68, wpb=430326, bsz=16470.5, num_updates=50700, lr=0.000280883, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 030: 1346 / 1707 loss=4.025, nll_loss=2.389, ppl=5.24, wps=294748, ups=0.68, wpb=430326, bsz=16470.5, num_updates=50700, lr=0.000280883, gnorm=0.247, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 030: 1446 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295999, ups=0.69, wpb=428943, bsz=16346.2, num_updates=50800, lr=0.000280607, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 030: 1446 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295999, ups=0.69, wpb=428943, bsz=16346.2, num_updates=50800, lr=0.000280607, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 030: 1446 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295999, ups=0.69, wpb=428943, bsz=16346.2, num_updates=50800, lr=0.000280607, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 030: 1446 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295999, ups=0.69, wpb=428943, bsz=16346.2, num_updates=50800, lr=0.000280607, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 030: 1446 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295999, ups=0.69, wpb=428943, bsz=16346.2, num_updates=50800, lr=0.000280607, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 030: 1446 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295999, ups=0.69, wpb=428943, bsz=16346.2, num_updates=50800, lr=0.000280607, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 030: 1446 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295999, ups=0.69, wpb=428943, bsz=16346.2, num_updates=50800, lr=0.000280607, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 030: 1446 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295999, ups=0.69, wpb=428943, bsz=16346.2, num_updates=50800, lr=0.000280607, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 030: 1546 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297994, ups=0.69, wpb=429196, bsz=16381.3, num_updates=50900, lr=0.000280331, gnorm=0.243, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=0 epoch 030: 1546 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297994, ups=0.69, wpb=429196, bsz=16381.3, num_updates=50900, lr=0.000280331, gnorm=0.243, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=0 epoch 030: 1546 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297994, ups=0.69, wpb=429196, bsz=16381.3, num_updates=50900, lr=0.000280331, gnorm=0.243, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=0 epoch 030: 1546 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297994, ups=0.69, wpb=429196, bsz=16381.3, num_updates=50900, lr=0.000280331, gnorm=0.243, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=0 epoch 030: 1546 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297994, ups=0.69, wpb=429196, bsz=16381.3, num_updates=50900, lr=0.000280331, gnorm=0.243, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=0 epoch 030: 1546 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297994, ups=0.69, wpb=429196, bsz=16381.3, num_updates=50900, lr=0.000280331, gnorm=0.243, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=0 epoch 030: 1546 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297994, ups=0.69, wpb=429196, bsz=16381.3, num_updates=50900, lr=0.000280331, gnorm=0.243, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=0 epoch 030: 1546 / 1707 loss=4.028, nll_loss=2.391, ppl=5.25, wps=297994, ups=0.69, wpb=429196, bsz=16381.3, num_updates=50900, lr=0.000280331, gnorm=0.243, clip=0, loss_scale=2, train_wall=143, gb_free=59, wall=0 epoch 030: 1646 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=296558, ups=0.69, wpb=428980, bsz=16133.4, num_updates=51000, lr=0.000280056, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 030: 1646 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=296558, ups=0.69, wpb=428980, bsz=16133.4, num_updates=51000, lr=0.000280056, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 030: 1646 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=296558, ups=0.69, wpb=428980, bsz=16133.4, num_updates=51000, lr=0.000280056, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 030: 1646 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=296558, ups=0.69, wpb=428980, bsz=16133.4, num_updates=51000, lr=0.000280056, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 030: 1646 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=296558, ups=0.69, wpb=428980, bsz=16133.4, num_updates=51000, lr=0.000280056, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 030: 1646 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=296558, ups=0.69, wpb=428980, bsz=16133.4, num_updates=51000, lr=0.000280056, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 030: 1646 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=296558, ups=0.69, wpb=428980, bsz=16133.4, num_updates=51000, lr=0.000280056, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 030: 1646 / 1707 loss=4.027, nll_loss=2.39, ppl=5.24, wps=296558, ups=0.69, wpb=428980, bsz=16133.4, num_updates=51000, lr=0.000280056, gnorm=0.235, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 4.279 | nll_loss 2.657 | ppl 6.31 | wps 76576.8 | wpb 21331 | bsz 1016 | num_updates 51000 | best_loss 4.279 epoch 030 | valid on 'valid' subset | loss 4.279 | nll_loss 2.657 | ppl 6.31 | wps 76576.8 | wpb 21331 | bsz 1016 | num_updates 51000 | best_loss 4.279 epoch 030 | valid on 'valid' subset | loss 4.279 | nll_loss 2.657 | ppl 6.31 | wps 76576.8 | wpb 21331 | bsz 1016 | num_updates 51000 | best_loss 4.279 epoch 030 | valid on 'valid' subset | loss 4.279 | nll_loss 2.657 | ppl 6.31 | wps 76576.8 | wpb 21331 | bsz 1016 | num_updates 51000 | best_loss 4.279 epoch 030 | valid on 'valid' subset | loss 4.279 | nll_loss 2.657 | ppl 6.31 | wps 76576.8 | wpb 21331 | bsz 1016 | num_updates 51000 | best_loss 4.279 epoch 030 | valid on 'valid' subset | loss 4.279 | nll_loss 2.657 | ppl 6.31 | wps 76576.8 | wpb 21331 | bsz 1016 | num_updates 51000 | best_loss 4.279 epoch 030 | valid on 'valid' subset | loss 4.279 | nll_loss 2.657 | ppl 6.31 | wps 76576.8 | wpb 21331 | bsz 1016 | num_updates 51000 | best_loss 4.279 epoch 030 | valid on 'valid' subset | loss 4.279 | nll_loss 2.657 | ppl 6.31 | wps 76576.8 | wpb 21331 | bsz 1016 | num_updates 51000 | best_loss 4.279 end of epoch 30 (average epoch stats below) epoch 030 | loss 4.012 | nll_loss 2.373 | ppl 5.18 | wps 292596 | ups 0.68 | wpb 428952 | bsz 16333.6 | num_updates 51061 | lr 0.000279889 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2452 | gb_free 59.9 | wall 0 epoch 030 | loss 4.012 | nll_loss 2.373 | ppl 5.18 | wps 292596 | ups 0.68 | wpb 428952 | bsz 16333.6 | num_updates 51061 | lr 0.000279889 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2452 | gb_free 59.9 | wall 0 epoch 030 | loss 4.012 | nll_loss 2.373 | ppl 5.18 | wps 292596 | ups 0.68 | wpb 428952 | bsz 16333.6 | num_updates 51061 | lr 0.000279889 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2452 | gb_free 59.9 | wall 0 epoch 030 | loss 4.012 | nll_loss 2.373 | ppl 5.18 | wps 292596 | ups 0.68 | wpb 428952 | bsz 16333.6 | num_updates 51061 | lr 0.000279889 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2452 | gb_free 59.9 | wall 0 epoch 030 | loss 4.012 | nll_loss 2.373 | ppl 5.18 | wps 292596 | ups 0.68 | wpb 428952 | bsz 16333.6 | num_updates 51061 | lr 0.000279889 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2452 | gb_free 59.9 | wall 0 epoch 030 | loss 4.012 | nll_loss 2.373 | ppl 5.18 | wps 292596 | ups 0.68 | wpb 428952 | bsz 16333.6 | num_updates 51061 | lr 0.000279889 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2452 | gb_free 59.9 | wall 0 epoch 030 | loss 4.012 | nll_loss 2.373 | ppl 5.18 | wps 292596 | ups 0.68 | wpb 428952 | bsz 16333.6 | num_updates 51061 | lr 0.000279889 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2452 | gb_free 59.9 | wall 0 epoch 030 | loss 4.012 | nll_loss 2.373 | ppl 5.18 | wps 292596 | ups 0.68 | wpb 428952 | bsz 16333.6 | num_updates 51061 | lr 0.000279889 | gnorm 0.235 | clip 0 | loss_scale 2 | train_wall 2452 | gb_free 59.9 | wall 0 Start iterating over samples epoch 031: 40 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=261630, ups=0.62, wpb=424908, bsz=16283.4, num_updates=51100, lr=0.000279782, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 031: 40 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=261630, ups=0.62, wpb=424908, bsz=16283.4, num_updates=51100, lr=0.000279782, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 031: 40 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=261630, ups=0.62, wpb=424908, bsz=16283.4, num_updates=51100, lr=0.000279782, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 031: 40 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=261630, ups=0.62, wpb=424908, bsz=16283.4, num_updates=51100, lr=0.000279782, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 031: 40 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=261630, ups=0.62, wpb=424908, bsz=16283.4, num_updates=51100, lr=0.000279782, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 031: 40 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=261630, ups=0.62, wpb=424908, bsz=16283.4, num_updates=51100, lr=0.000279782, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 031: 40 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=261630, ups=0.62, wpb=424908, bsz=16283.4, num_updates=51100, lr=0.000279782, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 031: 40 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=261630, ups=0.62, wpb=424908, bsz=16283.4, num_updates=51100, lr=0.000279782, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 031: 40 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=261630, ups=0.62, wpb=424908, bsz=16283.4, num_updates=51100, lr=0.000279782, gnorm=0.242, clip=0, loss_scale=1, train_wall=144, gb_free=59.5, wall=0 epoch 031: 141 / 1707 loss=3.991, nll_loss=2.348, ppl=5.09, wps=294336, ups=0.69, wpb=429088, bsz=16417.4, num_updates=51200, lr=0.000279508, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 141 / 1707 loss=3.991, nll_loss=2.348, ppl=5.09, wps=294336, ups=0.69, wpb=429088, bsz=16417.4, num_updates=51200, lr=0.000279508, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 141 / 1707 loss=3.991, nll_loss=2.348, ppl=5.09, wps=294336, ups=0.69, wpb=429088, bsz=16417.4, num_updates=51200, lr=0.000279508, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 141 / 1707 loss=3.991, nll_loss=2.348, ppl=5.09, wps=294336, ups=0.69, wpb=429088, bsz=16417.4, num_updates=51200, lr=0.000279508, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 141 / 1707 loss=3.991, nll_loss=2.348, ppl=5.09, wps=294336, ups=0.69, wpb=429088, bsz=16417.4, num_updates=51200, lr=0.000279508, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 141 / 1707 loss=3.991, nll_loss=2.348, ppl=5.09, wps=294336, ups=0.69, wpb=429088, bsz=16417.4, num_updates=51200, lr=0.000279508, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 141 / 1707 loss=3.991, nll_loss=2.348, ppl=5.09, wps=294336, ups=0.69, wpb=429088, bsz=16417.4, num_updates=51200, lr=0.000279508, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 141 / 1707 loss=3.991, nll_loss=2.348, ppl=5.09, wps=294336, ups=0.69, wpb=429088, bsz=16417.4, num_updates=51200, lr=0.000279508, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 141 / 1707 loss=3.991, nll_loss=2.348, ppl=5.09, wps=294336, ups=0.69, wpb=429088, bsz=16417.4, num_updates=51200, lr=0.000279508, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 241 / 1707 loss=3.992, nll_loss=2.349, ppl=5.1, wps=295811, ups=0.69, wpb=428085, bsz=16285, num_updates=51300, lr=0.000279236, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.6, wall=0 epoch 031: 241 / 1707 loss=3.992, nll_loss=2.349, ppl=5.1, wps=295811, ups=0.69, wpb=428085, bsz=16285, num_updates=51300, lr=0.000279236, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.6, wall=0 epoch 031: 241 / 1707 loss=3.992, nll_loss=2.349, ppl=5.1, wps=295811, ups=0.69, wpb=428085, bsz=16285, num_updates=51300, lr=0.000279236, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.6, wall=0 epoch 031: 241 / 1707 loss=3.992, nll_loss=2.349, ppl=5.1, wps=295811, ups=0.69, wpb=428085, bsz=16285, num_updates=51300, lr=0.000279236, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.6, wall=0 epoch 031: 241 / 1707 loss=3.992, nll_loss=2.349, ppl=5.1, wps=295811, ups=0.69, wpb=428085, bsz=16285, num_updates=51300, lr=0.000279236, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.6, wall=0 epoch 031: 241 / 1707 loss=3.992, nll_loss=2.349, ppl=5.1, wps=295811, ups=0.69, wpb=428085, bsz=16285, num_updates=51300, lr=0.000279236, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.6, wall=0 epoch 031: 241 / 1707 loss=3.992, nll_loss=2.349, ppl=5.1, wps=295811, ups=0.69, wpb=428085, bsz=16285, num_updates=51300, lr=0.000279236, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.6, wall=0 epoch 031: 241 / 1707 loss=3.992, nll_loss=2.349, ppl=5.1, wps=295811, ups=0.69, wpb=428085, bsz=16285, num_updates=51300, lr=0.000279236, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.6, wall=0 epoch 031: 241 / 1707 loss=3.992, nll_loss=2.349, ppl=5.1, wps=295811, ups=0.69, wpb=428085, bsz=16285, num_updates=51300, lr=0.000279236, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.6, wall=0 epoch 031: 341 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296575, ups=0.69, wpb=428107, bsz=16374.5, num_updates=51400, lr=0.000278964, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 341 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296575, ups=0.69, wpb=428107, bsz=16374.5, num_updates=51400, lr=0.000278964, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 341 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296575, ups=0.69, wpb=428107, bsz=16374.5, num_updates=51400, lr=0.000278964, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 341 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296575, ups=0.69, wpb=428107, bsz=16374.5, num_updates=51400, lr=0.000278964, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 341 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296575, ups=0.69, wpb=428107, bsz=16374.5, num_updates=51400, lr=0.000278964, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 341 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296575, ups=0.69, wpb=428107, bsz=16374.5, num_updates=51400, lr=0.000278964, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 341 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296575, ups=0.69, wpb=428107, bsz=16374.5, num_updates=51400, lr=0.000278964, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 341 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296575, ups=0.69, wpb=428107, bsz=16374.5, num_updates=51400, lr=0.000278964, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 341 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296575, ups=0.69, wpb=428107, bsz=16374.5, num_updates=51400, lr=0.000278964, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 442 / 1707 loss=4, nll_loss=2.359, ppl=5.13, wps=294583, ups=0.69, wpb=430028, bsz=16318.8, num_updates=51500, lr=0.000278693, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 442 / 1707 loss=4, nll_loss=2.359, ppl=5.13, wps=294583, ups=0.69, wpb=430028, bsz=16318.8, num_updates=51500, lr=0.000278693, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 442 / 1707 loss=4, nll_loss=2.359, ppl=5.13, wps=294583, ups=0.69, wpb=430028, bsz=16318.8, num_updates=51500, lr=0.000278693, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 442 / 1707 loss=4, nll_loss=2.359, ppl=5.13, wps=294583, ups=0.69, wpb=430028, bsz=16318.8, num_updates=51500, lr=0.000278693, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 442 / 1707 loss=4, nll_loss=2.359, ppl=5.13, wps=294583, ups=0.69, wpb=430028, bsz=16318.8, num_updates=51500, lr=0.000278693, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 442 / 1707 loss=4, nll_loss=2.359, ppl=5.13, wps=294583, ups=0.69, wpb=430028, bsz=16318.8, num_updates=51500, lr=0.000278693, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 442 / 1707 loss=4, nll_loss=2.359, ppl=5.13, wps=294583, ups=0.69, wpb=430028, bsz=16318.8, num_updates=51500, lr=0.000278693, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 442 / 1707 loss=4, nll_loss=2.359, ppl=5.13, wps=294583, ups=0.69, wpb=430028, bsz=16318.8, num_updates=51500, lr=0.000278693, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 442 / 1707 loss=4, nll_loss=2.359, ppl=5.13, wps=294583, ups=0.69, wpb=430028, bsz=16318.8, num_updates=51500, lr=0.000278693, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 031: 542 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297631, ups=0.69, wpb=429582, bsz=16295.2, num_updates=51600, lr=0.000278423, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 542 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297631, ups=0.69, wpb=429582, bsz=16295.2, num_updates=51600, lr=0.000278423, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 542 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297631, ups=0.69, wpb=429582, bsz=16295.2, num_updates=51600, lr=0.000278423, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 542 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297631, ups=0.69, wpb=429582, bsz=16295.2, num_updates=51600, lr=0.000278423, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 542 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297631, ups=0.69, wpb=429582, bsz=16295.2, num_updates=51600, lr=0.000278423, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 542 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297631, ups=0.69, wpb=429582, bsz=16295.2, num_updates=51600, lr=0.000278423, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 542 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297631, ups=0.69, wpb=429582, bsz=16295.2, num_updates=51600, lr=0.000278423, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 542 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297631, ups=0.69, wpb=429582, bsz=16295.2, num_updates=51600, lr=0.000278423, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 542 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297631, ups=0.69, wpb=429582, bsz=16295.2, num_updates=51600, lr=0.000278423, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 031: 642 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=296469, ups=0.69, wpb=428961, bsz=16589.4, num_updates=51700, lr=0.000278154, gnorm=0.239, clip=1, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 031: 642 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=296469, ups=0.69, wpb=428961, bsz=16589.4, num_updates=51700, lr=0.000278154, gnorm=0.239, clip=1, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 031: 642 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=296469, ups=0.69, wpb=428961, bsz=16589.4, num_updates=51700, lr=0.000278154, gnorm=0.239, clip=1, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 031: 642 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=296469, ups=0.69, wpb=428961, bsz=16589.4, num_updates=51700, lr=0.000278154, gnorm=0.239, clip=1, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 031: 642 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=296469, ups=0.69, wpb=428961, bsz=16589.4, num_updates=51700, lr=0.000278154, gnorm=0.239, clip=1, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 031: 642 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=296469, ups=0.69, wpb=428961, bsz=16589.4, num_updates=51700, lr=0.000278154, gnorm=0.239, clip=1, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 031: 642 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=296469, ups=0.69, wpb=428961, bsz=16589.4, num_updates=51700, lr=0.000278154, gnorm=0.239, clip=1, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 031: 642 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=296469, ups=0.69, wpb=428961, bsz=16589.4, num_updates=51700, lr=0.000278154, gnorm=0.239, clip=1, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 031: 642 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=296469, ups=0.69, wpb=428961, bsz=16589.4, num_updates=51700, lr=0.000278154, gnorm=0.239, clip=1, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 031: 742 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=297642, ups=0.69, wpb=429750, bsz=16375, num_updates=51800, lr=0.000277885, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 031: 742 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=297642, ups=0.69, wpb=429750, bsz=16375, num_updates=51800, lr=0.000277885, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 031: 742 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=297642, ups=0.69, wpb=429750, bsz=16375, num_updates=51800, lr=0.000277885, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 031: 742 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=297642, ups=0.69, wpb=429750, bsz=16375, num_updates=51800, lr=0.000277885, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 031: 742 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=297642, ups=0.69, wpb=429750, bsz=16375, num_updates=51800, lr=0.000277885, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 031: 742 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=297642, ups=0.69, wpb=429750, bsz=16375, num_updates=51800, lr=0.000277885, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 031: 742 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=297642, ups=0.69, wpb=429750, bsz=16375, num_updates=51800, lr=0.000277885, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 031: 742 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=297642, ups=0.69, wpb=429750, bsz=16375, num_updates=51800, lr=0.000277885, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 031: 742 / 1707 loss=4.015, nll_loss=2.376, ppl=5.19, wps=297642, ups=0.69, wpb=429750, bsz=16375, num_updates=51800, lr=0.000277885, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 031: 842 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297533, ups=0.69, wpb=430348, bsz=16251.4, num_updates=51900, lr=0.000277617, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 031: 842 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297533, ups=0.69, wpb=430348, bsz=16251.4, num_updates=51900, lr=0.000277617, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 031: 842 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297533, ups=0.69, wpb=430348, bsz=16251.4, num_updates=51900, lr=0.000277617, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 031: 842 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297533, ups=0.69, wpb=430348, bsz=16251.4, num_updates=51900, lr=0.000277617, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 031: 842 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297533, ups=0.69, wpb=430348, bsz=16251.4, num_updates=51900, lr=0.000277617, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 031: 842 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297533, ups=0.69, wpb=430348, bsz=16251.4, num_updates=51900, lr=0.000277617, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 031: 842 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297533, ups=0.69, wpb=430348, bsz=16251.4, num_updates=51900, lr=0.000277617, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 031: 842 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297533, ups=0.69, wpb=430348, bsz=16251.4, num_updates=51900, lr=0.000277617, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 031: 842 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=297533, ups=0.69, wpb=430348, bsz=16251.4, num_updates=51900, lr=0.000277617, gnorm=0.223, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 031: 942 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296512, ups=0.69, wpb=428505, bsz=16494.2, num_updates=52000, lr=0.00027735, gnorm=0.236, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 031: 942 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296512, ups=0.69, wpb=428505, bsz=16494.2, num_updates=52000, lr=0.00027735, gnorm=0.236, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 031: 942 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296512, ups=0.69, wpb=428505, bsz=16494.2, num_updates=52000, lr=0.00027735, gnorm=0.236, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 031: 942 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296512, ups=0.69, wpb=428505, bsz=16494.2, num_updates=52000, lr=0.00027735, gnorm=0.236, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 031: 942 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296512, ups=0.69, wpb=428505, bsz=16494.2, num_updates=52000, lr=0.00027735, gnorm=0.236, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 031: 942 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296512, ups=0.69, wpb=428505, bsz=16494.2, num_updates=52000, lr=0.00027735, gnorm=0.236, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 031: 942 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296512, ups=0.69, wpb=428505, bsz=16494.2, num_updates=52000, lr=0.00027735, gnorm=0.236, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 031: 942 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296512, ups=0.69, wpb=428505, bsz=16494.2, num_updates=52000, lr=0.00027735, gnorm=0.236, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 031: 942 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296512, ups=0.69, wpb=428505, bsz=16494.2, num_updates=52000, lr=0.00027735, gnorm=0.236, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 4.289 | nll_loss 2.666 | ppl 6.35 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 52000 | best_loss 4.279 epoch 031 | valid on 'valid' subset | loss 4.289 | nll_loss 2.666 | ppl 6.35 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 52000 | best_loss 4.279 epoch 031 | valid on 'valid' subset | loss 4.289 | nll_loss 2.666 | ppl 6.35 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 52000 | best_loss 4.279 epoch 031 | valid on 'valid' subset | loss 4.289 | nll_loss 2.666 | ppl 6.35 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 52000 | best_loss 4.279 epoch 031 | valid on 'valid' subset | loss 4.289 | nll_loss 2.666 | ppl 6.35 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 52000 | best_loss 4.279 epoch 031 | valid on 'valid' subset | loss 4.289 | nll_loss 2.666 | ppl 6.35 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 52000 | best_loss 4.279 epoch 031 | valid on 'valid' subset | loss 4.289 | nll_loss 2.666 | ppl 6.35 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 52000 | best_loss 4.279 epoch 031 | valid on 'valid' subset | loss 4.289 | nll_loss 2.666 | ppl 6.35 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 52000 | best_loss 4.279 epoch 031 | valid on 'valid' subset | loss 4.289 | nll_loss 2.666 | ppl 6.35 | wps 76925.5 | wpb 21331 | bsz 1016 | num_updates 52000 | best_loss 4.279 epoch 031: 1043 / 1707 loss=4.009, nll_loss=2.37, ppl=5.17, wps=271854, ups=0.63, wpb=429924, bsz=16243.7, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 031: 1043 / 1707 loss=4.009, nll_loss=2.37, ppl=5.17, wps=271854, ups=0.63, wpb=429924, bsz=16243.7, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 031: 1043 / 1707 loss=4.009, nll_loss=2.37, ppl=5.17, wps=271854, ups=0.63, wpb=429924, bsz=16243.7, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 031: 1043 / 1707 loss=4.009, nll_loss=2.37, ppl=5.17, wps=271854, ups=0.63, wpb=429924, bsz=16243.7, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 031: 1043 / 1707 loss=4.009, nll_loss=2.37, ppl=5.17, wps=271854, ups=0.63, wpb=429924, bsz=16243.7, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 031: 1043 / 1707 loss=4.009, nll_loss=2.37, ppl=5.17, wps=271854, ups=0.63, wpb=429924, bsz=16243.7, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 031: 1043 / 1707 loss=4.009, nll_loss=2.37, ppl=5.17, wps=271854, ups=0.63, wpb=429924, bsz=16243.7, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 031: 1043 / 1707 loss=4.009, nll_loss=2.37, ppl=5.17, wps=271854, ups=0.63, wpb=429924, bsz=16243.7, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 031: 1043 / 1707 loss=4.009, nll_loss=2.37, ppl=5.17, wps=271854, ups=0.63, wpb=429924, bsz=16243.7, num_updates=52100, lr=0.000277084, gnorm=0.231, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 031: 1144 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=295015, ups=0.69, wpb=429084, bsz=16217, num_updates=52200, lr=0.000276818, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 031: 1144 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=295015, ups=0.69, wpb=429084, bsz=16217, num_updates=52200, lr=0.000276818, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 031: 1144 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=295015, ups=0.69, wpb=429084, bsz=16217, num_updates=52200, lr=0.000276818, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 031: 1144 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=295015, ups=0.69, wpb=429084, bsz=16217, num_updates=52200, lr=0.000276818, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 031: 1144 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=295015, ups=0.69, wpb=429084, bsz=16217, num_updates=52200, lr=0.000276818, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 031: 1144 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=295015, ups=0.69, wpb=429084, bsz=16217, num_updates=52200, lr=0.000276818, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 031: 1144 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=295015, ups=0.69, wpb=429084, bsz=16217, num_updates=52200, lr=0.000276818, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 031: 1144 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=295015, ups=0.69, wpb=429084, bsz=16217, num_updates=52200, lr=0.000276818, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 031: 1144 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=295015, ups=0.69, wpb=429084, bsz=16217, num_updates=52200, lr=0.000276818, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 031: 1244 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297181, ups=0.69, wpb=428572, bsz=16136.6, num_updates=52300, lr=0.000276553, gnorm=0.253, clip=1, loss_scale=0.5, train_wall=143, gb_free=59, wall=0 epoch 031: 1244 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297181, ups=0.69, wpb=428572, bsz=16136.6, num_updates=52300, lr=0.000276553, gnorm=0.253, clip=1, loss_scale=0.5, train_wall=143, gb_free=59, wall=0 epoch 031: 1244 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297181, ups=0.69, wpb=428572, bsz=16136.6, num_updates=52300, lr=0.000276553, gnorm=0.253, clip=1, loss_scale=0.5, train_wall=143, gb_free=59, wall=0 epoch 031: 1244 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297181, ups=0.69, wpb=428572, bsz=16136.6, num_updates=52300, lr=0.000276553, gnorm=0.253, clip=1, loss_scale=0.5, train_wall=143, gb_free=59, wall=0 epoch 031: 1244 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297181, ups=0.69, wpb=428572, bsz=16136.6, num_updates=52300, lr=0.000276553, gnorm=0.253, clip=1, loss_scale=0.5, train_wall=143, gb_free=59, wall=0 epoch 031: 1244 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297181, ups=0.69, wpb=428572, bsz=16136.6, num_updates=52300, lr=0.000276553, gnorm=0.253, clip=1, loss_scale=0.5, train_wall=143, gb_free=59, wall=0 epoch 031: 1244 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297181, ups=0.69, wpb=428572, bsz=16136.6, num_updates=52300, lr=0.000276553, gnorm=0.253, clip=1, loss_scale=0.5, train_wall=143, gb_free=59, wall=0 epoch 031: 1244 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297181, ups=0.69, wpb=428572, bsz=16136.6, num_updates=52300, lr=0.000276553, gnorm=0.253, clip=1, loss_scale=0.5, train_wall=143, gb_free=59, wall=0 epoch 031: 1244 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297181, ups=0.69, wpb=428572, bsz=16136.6, num_updates=52300, lr=0.000276553, gnorm=0.253, clip=1, loss_scale=0.5, train_wall=143, gb_free=59, wall=0 epoch 031: 1344 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=296538, ups=0.69, wpb=427844, bsz=16319, num_updates=52400, lr=0.000276289, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1344 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=296538, ups=0.69, wpb=427844, bsz=16319, num_updates=52400, lr=0.000276289, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1344 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=296538, ups=0.69, wpb=427844, bsz=16319, num_updates=52400, lr=0.000276289, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1344 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=296538, ups=0.69, wpb=427844, bsz=16319, num_updates=52400, lr=0.000276289, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1344 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=296538, ups=0.69, wpb=427844, bsz=16319, num_updates=52400, lr=0.000276289, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1344 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=296538, ups=0.69, wpb=427844, bsz=16319, num_updates=52400, lr=0.000276289, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1344 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=296538, ups=0.69, wpb=427844, bsz=16319, num_updates=52400, lr=0.000276289, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1344 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=296538, ups=0.69, wpb=427844, bsz=16319, num_updates=52400, lr=0.000276289, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1344 / 1707 loss=4.02, nll_loss=2.382, ppl=5.21, wps=296538, ups=0.69, wpb=427844, bsz=16319, num_updates=52400, lr=0.000276289, gnorm=0.241, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1444 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297361, ups=0.69, wpb=430204, bsz=16587.4, num_updates=52500, lr=0.000276026, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1444 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297361, ups=0.69, wpb=430204, bsz=16587.4, num_updates=52500, lr=0.000276026, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1444 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297361, ups=0.69, wpb=430204, bsz=16587.4, num_updates=52500, lr=0.000276026, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1444 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297361, ups=0.69, wpb=430204, bsz=16587.4, num_updates=52500, lr=0.000276026, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1444 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297361, ups=0.69, wpb=430204, bsz=16587.4, num_updates=52500, lr=0.000276026, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1444 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297361, ups=0.69, wpb=430204, bsz=16587.4, num_updates=52500, lr=0.000276026, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1444 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297361, ups=0.69, wpb=430204, bsz=16587.4, num_updates=52500, lr=0.000276026, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1444 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297361, ups=0.69, wpb=430204, bsz=16587.4, num_updates=52500, lr=0.000276026, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1444 / 1707 loss=4.016, nll_loss=2.378, ppl=5.2, wps=297361, ups=0.69, wpb=430204, bsz=16587.4, num_updates=52500, lr=0.000276026, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.1, wall=0 epoch 031: 1544 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=296607, ups=0.69, wpb=428540, bsz=16240.2, num_updates=52600, lr=0.000275764, gnorm=0.25, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 031: 1544 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=296607, ups=0.69, wpb=428540, bsz=16240.2, num_updates=52600, lr=0.000275764, gnorm=0.25, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 031: 1544 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=296607, ups=0.69, wpb=428540, bsz=16240.2, num_updates=52600, lr=0.000275764, gnorm=0.25, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 031: 1544 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=296607, ups=0.69, wpb=428540, bsz=16240.2, num_updates=52600, lr=0.000275764, gnorm=0.25, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 031: 1544 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=296607, ups=0.69, wpb=428540, bsz=16240.2, num_updates=52600, lr=0.000275764, gnorm=0.25, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 031: 1544 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=296607, ups=0.69, wpb=428540, bsz=16240.2, num_updates=52600, lr=0.000275764, gnorm=0.25, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 031: 1544 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=296607, ups=0.69, wpb=428540, bsz=16240.2, num_updates=52600, lr=0.000275764, gnorm=0.25, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 031: 1544 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=296607, ups=0.69, wpb=428540, bsz=16240.2, num_updates=52600, lr=0.000275764, gnorm=0.25, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 031: 1544 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=296607, ups=0.69, wpb=428540, bsz=16240.2, num_updates=52600, lr=0.000275764, gnorm=0.25, clip=0, loss_scale=1, train_wall=144, gb_free=58.9, wall=0 epoch 031: 1645 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295487, ups=0.69, wpb=429774, bsz=16251.2, num_updates=52700, lr=0.000275502, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=58.9, wall=0 epoch 031: 1645 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295487, ups=0.69, wpb=429774, bsz=16251.2, num_updates=52700, lr=0.000275502, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=58.9, wall=0 epoch 031: 1645 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295487, ups=0.69, wpb=429774, bsz=16251.2, num_updates=52700, lr=0.000275502, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=58.9, wall=0 epoch 031: 1645 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295487, ups=0.69, wpb=429774, bsz=16251.2, num_updates=52700, lr=0.000275502, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=58.9, wall=0 epoch 031: 1645 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295487, ups=0.69, wpb=429774, bsz=16251.2, num_updates=52700, lr=0.000275502, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=58.9, wall=0 epoch 031: 1645 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295487, ups=0.69, wpb=429774, bsz=16251.2, num_updates=52700, lr=0.000275502, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=58.9, wall=0 epoch 031: 1645 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295487, ups=0.69, wpb=429774, bsz=16251.2, num_updates=52700, lr=0.000275502, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=58.9, wall=0 epoch 031: 1645 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295487, ups=0.69, wpb=429774, bsz=16251.2, num_updates=52700, lr=0.000275502, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=58.9, wall=0 epoch 031: 1645 / 1707 loss=4.025, nll_loss=2.388, ppl=5.23, wps=295487, ups=0.69, wpb=429774, bsz=16251.2, num_updates=52700, lr=0.000275502, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=58.9, wall=0 end of epoch 31 (average epoch stats below) epoch 031 | loss 4.008 | nll_loss 2.368 | ppl 5.16 | wps 294608 | ups 0.69 | wpb 428936 | bsz 16329.7 | num_updates 52762 | lr 0.00027534 | gnorm 0.235 | clip 0.1 | loss_scale 0.5 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 031 | loss 4.008 | nll_loss 2.368 | ppl 5.16 | wps 294608 | ups 0.69 | wpb 428936 | bsz 16329.7 | num_updates 52762 | lr 0.00027534 | gnorm 0.235 | clip 0.1 | loss_scale 0.5 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 031 | loss 4.008 | nll_loss 2.368 | ppl 5.16 | wps 294608 | ups 0.69 | wpb 428936 | bsz 16329.7 | num_updates 52762 | lr 0.00027534 | gnorm 0.235 | clip 0.1 | loss_scale 0.5 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 031 | loss 4.008 | nll_loss 2.368 | ppl 5.16 | wps 294608 | ups 0.69 | wpb 428936 | bsz 16329.7 | num_updates 52762 | lr 0.00027534 | gnorm 0.235 | clip 0.1 | loss_scale 0.5 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 031 | loss 4.008 | nll_loss 2.368 | ppl 5.16 | wps 294608 | ups 0.69 | wpb 428936 | bsz 16329.7 | num_updates 52762 | lr 0.00027534 | gnorm 0.235 | clip 0.1 | loss_scale 0.5 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 031 | loss 4.008 | nll_loss 2.368 | ppl 5.16 | wps 294608 | ups 0.69 | wpb 428936 | bsz 16329.7 | num_updates 52762 | lr 0.00027534 | gnorm 0.235 | clip 0.1 | loss_scale 0.5 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 031 | loss 4.008 | nll_loss 2.368 | ppl 5.16 | wps 294608 | ups 0.69 | wpb 428936 | bsz 16329.7 | num_updates 52762 | lr 0.00027534 | gnorm 0.235 | clip 0.1 | loss_scale 0.5 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 031 | loss 4.008 | nll_loss 2.368 | ppl 5.16 | wps 294608 | ups 0.69 | wpb 428936 | bsz 16329.7 | num_updates 52762 | lr 0.00027534 | gnorm 0.235 | clip 0.1 | loss_scale 0.5 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 031 | loss 4.008 | nll_loss 2.368 | ppl 5.16 | wps 294608 | ups 0.69 | wpb 428936 | bsz 16329.7 | num_updates 52762 | lr 0.00027534 | gnorm 0.235 | clip 0.1 | loss_scale 0.5 | train_wall 2450 | gb_free 59.2 | wall 0 Start iterating over samples epoch 032: 38 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=296635, ups=0.7, wpb=425928, bsz=15972.7, num_updates=52800, lr=0.000275241, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.2, wall=0 epoch 032: 38 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=296635, ups=0.7, wpb=425928, bsz=15972.7, num_updates=52800, lr=0.000275241, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.2, wall=0 epoch 032: 38 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=296635, ups=0.7, wpb=425928, bsz=15972.7, num_updates=52800, lr=0.000275241, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.2, wall=0 epoch 032: 38 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=296635, ups=0.7, wpb=425928, bsz=15972.7, num_updates=52800, lr=0.000275241, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.2, wall=0 epoch 032: 38 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=296635, ups=0.7, wpb=425928, bsz=15972.7, num_updates=52800, lr=0.000275241, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.2, wall=0 epoch 032: 38 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=296635, ups=0.7, wpb=425928, bsz=15972.7, num_updates=52800, lr=0.000275241, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.2, wall=0 epoch 032: 38 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=296635, ups=0.7, wpb=425928, bsz=15972.7, num_updates=52800, lr=0.000275241, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.2, wall=0 epoch 032: 38 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=296635, ups=0.7, wpb=425928, bsz=15972.7, num_updates=52800, lr=0.000275241, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.2, wall=0 epoch 032: 38 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=296635, ups=0.7, wpb=425928, bsz=15972.7, num_updates=52800, lr=0.000275241, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.2, wall=0 epoch 032: 38 / 1707 loss=4.004, nll_loss=2.363, ppl=5.15, wps=296635, ups=0.7, wpb=425928, bsz=15972.7, num_updates=52800, lr=0.000275241, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.2, wall=0 epoch 032: 138 / 1707 loss=3.987, nll_loss=2.343, ppl=5.08, wps=295698, ups=0.69, wpb=428456, bsz=16396.9, num_updates=52900, lr=0.000274981, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 032: 138 / 1707 loss=3.987, nll_loss=2.343, ppl=5.08, wps=295698, ups=0.69, wpb=428456, bsz=16396.9, num_updates=52900, lr=0.000274981, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 032: 138 / 1707 loss=3.987, nll_loss=2.343, ppl=5.08, wps=295698, ups=0.69, wpb=428456, bsz=16396.9, num_updates=52900, lr=0.000274981, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 032: 138 / 1707 loss=3.987, nll_loss=2.343, ppl=5.08, wps=295698, ups=0.69, wpb=428456, bsz=16396.9, num_updates=52900, lr=0.000274981, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 032: 138 / 1707 loss=3.987, nll_loss=2.343, ppl=5.08, wps=295698, ups=0.69, wpb=428456, bsz=16396.9, num_updates=52900, lr=0.000274981, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 032: 138 / 1707 loss=3.987, nll_loss=2.343, ppl=5.08, wps=295698, ups=0.69, wpb=428456, bsz=16396.9, num_updates=52900, lr=0.000274981, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 032: 138 / 1707 loss=3.987, nll_loss=2.343, ppl=5.08, wps=295698, ups=0.69, wpb=428456, bsz=16396.9, num_updates=52900, lr=0.000274981, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 032: 138 / 1707 loss=3.987, nll_loss=2.343, ppl=5.08, wps=295698, ups=0.69, wpb=428456, bsz=16396.9, num_updates=52900, lr=0.000274981, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 032: 138 / 1707 loss=3.987, nll_loss=2.343, ppl=5.08, wps=295698, ups=0.69, wpb=428456, bsz=16396.9, num_updates=52900, lr=0.000274981, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 032: 138 / 1707 loss=3.987, nll_loss=2.343, ppl=5.08, wps=295698, ups=0.69, wpb=428456, bsz=16396.9, num_updates=52900, lr=0.000274981, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 032: 238 / 1707 loss=3.996, nll_loss=2.354, ppl=5.11, wps=296836, ups=0.69, wpb=429231, bsz=16643, num_updates=53000, lr=0.000274721, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 032: 238 / 1707 loss=3.996, nll_loss=2.354, ppl=5.11, wps=296836, ups=0.69, wpb=429231, bsz=16643, num_updates=53000, lr=0.000274721, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 032: 238 / 1707 loss=3.996, nll_loss=2.354, ppl=5.11, wps=296836, ups=0.69, wpb=429231, bsz=16643, num_updates=53000, lr=0.000274721, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 032: 238 / 1707 loss=3.996, nll_loss=2.354, ppl=5.11, wps=296836, ups=0.69, wpb=429231, bsz=16643, num_updates=53000, lr=0.000274721, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 032: 238 / 1707 loss=3.996, nll_loss=2.354, ppl=5.11, wps=296836, ups=0.69, wpb=429231, bsz=16643, num_updates=53000, lr=0.000274721, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 032: 238 / 1707 loss=3.996, nll_loss=2.354, ppl=5.11, wps=296836, ups=0.69, wpb=429231, bsz=16643, num_updates=53000, lr=0.000274721, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 032: 238 / 1707 loss=3.996, nll_loss=2.354, ppl=5.11, wps=296836, ups=0.69, wpb=429231, bsz=16643, num_updates=53000, lr=0.000274721, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 032: 238 / 1707 loss=3.996, nll_loss=2.354, ppl=5.11, wps=296836, ups=0.69, wpb=429231, bsz=16643, num_updates=53000, lr=0.000274721, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 032: 238 / 1707 loss=3.996, nll_loss=2.354, ppl=5.11, wps=296836, ups=0.69, wpb=429231, bsz=16643, num_updates=53000, lr=0.000274721, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 032: 238 / 1707 loss=3.996, nll_loss=2.354, ppl=5.11, wps=296836, ups=0.69, wpb=429231, bsz=16643, num_updates=53000, lr=0.000274721, gnorm=0.238, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.668 | ppl 6.36 | wps 68094.4 | wpb 21331 | bsz 1016 | num_updates 53000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.668 | ppl 6.36 | wps 68094.4 | wpb 21331 | bsz 1016 | num_updates 53000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.668 | ppl 6.36 | wps 68094.4 | wpb 21331 | bsz 1016 | num_updates 53000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.668 | ppl 6.36 | wps 68094.4 | wpb 21331 | bsz 1016 | num_updates 53000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.668 | ppl 6.36 | wps 68094.4 | wpb 21331 | bsz 1016 | num_updates 53000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.668 | ppl 6.36 | wps 68094.4 | wpb 21331 | bsz 1016 | num_updates 53000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.668 | ppl 6.36 | wps 68094.4 | wpb 21331 | bsz 1016 | num_updates 53000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.668 | ppl 6.36 | wps 68094.4 | wpb 21331 | bsz 1016 | num_updates 53000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.668 | ppl 6.36 | wps 68094.4 | wpb 21331 | bsz 1016 | num_updates 53000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.668 | ppl 6.36 | wps 68094.4 | wpb 21331 | bsz 1016 | num_updates 53000 | best_loss 4.279 epoch 032: 338 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=273891, ups=0.64, wpb=427840, bsz=16231, num_updates=53100, lr=0.000274462, gnorm=0.239, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 032: 338 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=273891, ups=0.64, wpb=427840, bsz=16231, num_updates=53100, lr=0.000274462, gnorm=0.239, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 032: 338 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=273891, ups=0.64, wpb=427840, bsz=16231, num_updates=53100, lr=0.000274462, gnorm=0.239, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 032: 338 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=273891, ups=0.64, wpb=427840, bsz=16231, num_updates=53100, lr=0.000274462, gnorm=0.239, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 032: 338 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=273891, ups=0.64, wpb=427840, bsz=16231, num_updates=53100, lr=0.000274462, gnorm=0.239, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 032: 338 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=273891, ups=0.64, wpb=427840, bsz=16231, num_updates=53100, lr=0.000274462, gnorm=0.239, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 032: 338 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=273891, ups=0.64, wpb=427840, bsz=16231, num_updates=53100, lr=0.000274462, gnorm=0.239, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 032: 338 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=273891, ups=0.64, wpb=427840, bsz=16231, num_updates=53100, lr=0.000274462, gnorm=0.239, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 032: 338 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=273891, ups=0.64, wpb=427840, bsz=16231, num_updates=53100, lr=0.000274462, gnorm=0.239, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 032: 338 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=273891, ups=0.64, wpb=427840, bsz=16231, num_updates=53100, lr=0.000274462, gnorm=0.239, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 032: 438 / 1707 loss=3.998, nll_loss=2.356, ppl=5.12, wps=296778, ups=0.69, wpb=428962, bsz=16535, num_updates=53200, lr=0.000274204, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 032: 438 / 1707 loss=3.998, nll_loss=2.356, ppl=5.12, wps=296778, ups=0.69, wpb=428962, bsz=16535, num_updates=53200, lr=0.000274204, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 032: 438 / 1707 loss=3.998, nll_loss=2.356, ppl=5.12, wps=296778, ups=0.69, wpb=428962, bsz=16535, num_updates=53200, lr=0.000274204, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 032: 438 / 1707 loss=3.998, nll_loss=2.356, ppl=5.12, wps=296778, ups=0.69, wpb=428962, bsz=16535, num_updates=53200, lr=0.000274204, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 032: 438 / 1707 loss=3.998, nll_loss=2.356, ppl=5.12, wps=296778, ups=0.69, wpb=428962, bsz=16535, num_updates=53200, lr=0.000274204, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 032: 438 / 1707 loss=3.998, nll_loss=2.356, ppl=5.12, wps=296778, ups=0.69, wpb=428962, bsz=16535, num_updates=53200, lr=0.000274204, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 032: 438 / 1707 loss=3.998, nll_loss=2.356, ppl=5.12, wps=296778, ups=0.69, wpb=428962, bsz=16535, num_updates=53200, lr=0.000274204, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 032: 438 / 1707 loss=3.998, nll_loss=2.356, ppl=5.12, wps=296778, ups=0.69, wpb=428962, bsz=16535, num_updates=53200, lr=0.000274204, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 032: 438 / 1707 loss=3.998, nll_loss=2.356, ppl=5.12, wps=296778, ups=0.69, wpb=428962, bsz=16535, num_updates=53200, lr=0.000274204, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 032: 438 / 1707 loss=3.998, nll_loss=2.356, ppl=5.12, wps=296778, ups=0.69, wpb=428962, bsz=16535, num_updates=53200, lr=0.000274204, gnorm=0.238, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 032: 539 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=293818, ups=0.68, wpb=429106, bsz=16397.8, num_updates=53300, lr=0.000273947, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 032: 539 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=293818, ups=0.68, wpb=429106, bsz=16397.8, num_updates=53300, lr=0.000273947, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 032: 539 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=293818, ups=0.68, wpb=429106, bsz=16397.8, num_updates=53300, lr=0.000273947, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 032: 539 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=293818, ups=0.68, wpb=429106, bsz=16397.8, num_updates=53300, lr=0.000273947, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 032: 539 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=293818, ups=0.68, wpb=429106, bsz=16397.8, num_updates=53300, lr=0.000273947, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 032: 539 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=293818, ups=0.68, wpb=429106, bsz=16397.8, num_updates=53300, lr=0.000273947, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 032: 539 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=293818, ups=0.68, wpb=429106, bsz=16397.8, num_updates=53300, lr=0.000273947, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 032: 539 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=293818, ups=0.68, wpb=429106, bsz=16397.8, num_updates=53300, lr=0.000273947, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 032: 539 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=293818, ups=0.68, wpb=429106, bsz=16397.8, num_updates=53300, lr=0.000273947, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 032: 539 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=293818, ups=0.68, wpb=429106, bsz=16397.8, num_updates=53300, lr=0.000273947, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 032: 639 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296229, ups=0.69, wpb=428133, bsz=16366.7, num_updates=53400, lr=0.00027369, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 639 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296229, ups=0.69, wpb=428133, bsz=16366.7, num_updates=53400, lr=0.00027369, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 639 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296229, ups=0.69, wpb=428133, bsz=16366.7, num_updates=53400, lr=0.00027369, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 639 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296229, ups=0.69, wpb=428133, bsz=16366.7, num_updates=53400, lr=0.00027369, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 639 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296229, ups=0.69, wpb=428133, bsz=16366.7, num_updates=53400, lr=0.00027369, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 639 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296229, ups=0.69, wpb=428133, bsz=16366.7, num_updates=53400, lr=0.00027369, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 639 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296229, ups=0.69, wpb=428133, bsz=16366.7, num_updates=53400, lr=0.00027369, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 639 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296229, ups=0.69, wpb=428133, bsz=16366.7, num_updates=53400, lr=0.00027369, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 639 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296229, ups=0.69, wpb=428133, bsz=16366.7, num_updates=53400, lr=0.00027369, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 639 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=296229, ups=0.69, wpb=428133, bsz=16366.7, num_updates=53400, lr=0.00027369, gnorm=0.239, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 739 / 1707 loss=3.998, nll_loss=2.357, ppl=5.12, wps=297798, ups=0.69, wpb=430193, bsz=16405.8, num_updates=53500, lr=0.000273434, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 epoch 032: 739 / 1707 loss=3.998, nll_loss=2.357, ppl=5.12, wps=297798, ups=0.69, wpb=430193, bsz=16405.8, num_updates=53500, lr=0.000273434, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 epoch 032: 739 / 1707 loss=3.998, nll_loss=2.357, ppl=5.12, wps=297798, ups=0.69, wpb=430193, bsz=16405.8, num_updates=53500, lr=0.000273434, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 epoch 032: 739 / 1707 loss=3.998, nll_loss=2.357, ppl=5.12, wps=297798, ups=0.69, wpb=430193, bsz=16405.8, num_updates=53500, lr=0.000273434, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 epoch 032: 739 / 1707 loss=3.998, nll_loss=2.357, ppl=5.12, wps=297798, ups=0.69, wpb=430193, bsz=16405.8, num_updates=53500, lr=0.000273434, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 epoch 032: 739 / 1707 loss=3.998, nll_loss=2.357, ppl=5.12, wps=297798, ups=0.69, wpb=430193, bsz=16405.8, num_updates=53500, lr=0.000273434, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 epoch 032: 739 / 1707 loss=3.998, nll_loss=2.357, ppl=5.12, wps=297798, ups=0.69, wpb=430193, bsz=16405.8, num_updates=53500, lr=0.000273434, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 epoch 032: 739 / 1707 loss=3.998, nll_loss=2.357, ppl=5.12, wps=297798, ups=0.69, wpb=430193, bsz=16405.8, num_updates=53500, lr=0.000273434, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 epoch 032: 739 / 1707 loss=3.998, nll_loss=2.357, ppl=5.12, wps=297798, ups=0.69, wpb=430193, bsz=16405.8, num_updates=53500, lr=0.000273434, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 epoch 032: 739 / 1707 loss=3.998, nll_loss=2.357, ppl=5.12, wps=297798, ups=0.69, wpb=430193, bsz=16405.8, num_updates=53500, lr=0.000273434, gnorm=0.24, clip=0, loss_scale=2, train_wall=144, gb_free=59.5, wall=0 epoch 032: 839 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296913, ups=0.69, wpb=429199, bsz=16515.5, num_updates=53600, lr=0.000273179, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 032: 839 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296913, ups=0.69, wpb=429199, bsz=16515.5, num_updates=53600, lr=0.000273179, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 032: 839 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296913, ups=0.69, wpb=429199, bsz=16515.5, num_updates=53600, lr=0.000273179, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 032: 839 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296913, ups=0.69, wpb=429199, bsz=16515.5, num_updates=53600, lr=0.000273179, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 032: 839 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296913, ups=0.69, wpb=429199, bsz=16515.5, num_updates=53600, lr=0.000273179, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 032: 839 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296913, ups=0.69, wpb=429199, bsz=16515.5, num_updates=53600, lr=0.000273179, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 032: 839 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296913, ups=0.69, wpb=429199, bsz=16515.5, num_updates=53600, lr=0.000273179, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 032: 839 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296913, ups=0.69, wpb=429199, bsz=16515.5, num_updates=53600, lr=0.000273179, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 032: 839 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296913, ups=0.69, wpb=429199, bsz=16515.5, num_updates=53600, lr=0.000273179, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 032: 839 / 1707 loss=4.012, nll_loss=2.373, ppl=5.18, wps=296913, ups=0.69, wpb=429199, bsz=16515.5, num_updates=53600, lr=0.000273179, gnorm=0.234, clip=0, loss_scale=2, train_wall=144, gb_free=58.8, wall=0 epoch 032: 940 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=294407, ups=0.69, wpb=428767, bsz=16291.8, num_updates=53700, lr=0.000272925, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=0 epoch 032: 940 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=294407, ups=0.69, wpb=428767, bsz=16291.8, num_updates=53700, lr=0.000272925, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=0 epoch 032: 940 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=294407, ups=0.69, wpb=428767, bsz=16291.8, num_updates=53700, lr=0.000272925, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=0 epoch 032: 940 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=294407, ups=0.69, wpb=428767, bsz=16291.8, num_updates=53700, lr=0.000272925, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=0 epoch 032: 940 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=294407, ups=0.69, wpb=428767, bsz=16291.8, num_updates=53700, lr=0.000272925, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=0 epoch 032: 940 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=294407, ups=0.69, wpb=428767, bsz=16291.8, num_updates=53700, lr=0.000272925, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=0 epoch 032: 940 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=294407, ups=0.69, wpb=428767, bsz=16291.8, num_updates=53700, lr=0.000272925, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=0 epoch 032: 940 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=294407, ups=0.69, wpb=428767, bsz=16291.8, num_updates=53700, lr=0.000272925, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=0 epoch 032: 940 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=294407, ups=0.69, wpb=428767, bsz=16291.8, num_updates=53700, lr=0.000272925, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=0 epoch 032: 940 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=294407, ups=0.69, wpb=428767, bsz=16291.8, num_updates=53700, lr=0.000272925, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.5, wall=0 epoch 032: 1040 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298302, ups=0.69, wpb=430262, bsz=16168.7, num_updates=53800, lr=0.000272671, gnorm=0.233, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 032: 1040 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298302, ups=0.69, wpb=430262, bsz=16168.7, num_updates=53800, lr=0.000272671, gnorm=0.233, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 032: 1040 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298302, ups=0.69, wpb=430262, bsz=16168.7, num_updates=53800, lr=0.000272671, gnorm=0.233, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 032: 1040 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298302, ups=0.69, wpb=430262, bsz=16168.7, num_updates=53800, lr=0.000272671, gnorm=0.233, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 032: 1040 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298302, ups=0.69, wpb=430262, bsz=16168.7, num_updates=53800, lr=0.000272671, gnorm=0.233, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 032: 1040 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298302, ups=0.69, wpb=430262, bsz=16168.7, num_updates=53800, lr=0.000272671, gnorm=0.233, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 032: 1040 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298302, ups=0.69, wpb=430262, bsz=16168.7, num_updates=53800, lr=0.000272671, gnorm=0.233, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 032: 1040 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298302, ups=0.69, wpb=430262, bsz=16168.7, num_updates=53800, lr=0.000272671, gnorm=0.233, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 032: 1040 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298302, ups=0.69, wpb=430262, bsz=16168.7, num_updates=53800, lr=0.000272671, gnorm=0.233, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 032: 1040 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298302, ups=0.69, wpb=430262, bsz=16168.7, num_updates=53800, lr=0.000272671, gnorm=0.233, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 032: 1140 / 1707 loss=4.004, nll_loss=2.365, ppl=5.15, wps=297583, ups=0.69, wpb=429413, bsz=16319.2, num_updates=53900, lr=0.000272418, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 032: 1140 / 1707 loss=4.004, nll_loss=2.365, ppl=5.15, wps=297583, ups=0.69, wpb=429413, bsz=16319.2, num_updates=53900, lr=0.000272418, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 032: 1140 / 1707 loss=4.004, nll_loss=2.365, ppl=5.15, wps=297583, ups=0.69, wpb=429413, bsz=16319.2, num_updates=53900, lr=0.000272418, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 032: 1140 / 1707 loss=4.004, nll_loss=2.365, ppl=5.15, wps=297583, ups=0.69, wpb=429413, bsz=16319.2, num_updates=53900, lr=0.000272418, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 032: 1140 / 1707 loss=4.004, nll_loss=2.365, ppl=5.15, wps=297583, ups=0.69, wpb=429413, bsz=16319.2, num_updates=53900, lr=0.000272418, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 032: 1140 / 1707 loss=4.004, nll_loss=2.365, ppl=5.15, wps=297583, ups=0.69, wpb=429413, bsz=16319.2, num_updates=53900, lr=0.000272418, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 032: 1140 / 1707 loss=4.004, nll_loss=2.365, ppl=5.15, wps=297583, ups=0.69, wpb=429413, bsz=16319.2, num_updates=53900, lr=0.000272418, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 032: 1140 / 1707 loss=4.004, nll_loss=2.365, ppl=5.15, wps=297583, ups=0.69, wpb=429413, bsz=16319.2, num_updates=53900, lr=0.000272418, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 032: 1140 / 1707 loss=4.004, nll_loss=2.365, ppl=5.15, wps=297583, ups=0.69, wpb=429413, bsz=16319.2, num_updates=53900, lr=0.000272418, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 032: 1140 / 1707 loss=4.004, nll_loss=2.365, ppl=5.15, wps=297583, ups=0.69, wpb=429413, bsz=16319.2, num_updates=53900, lr=0.000272418, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 032: 1241 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=292901, ups=0.68, wpb=427874, bsz=16285.9, num_updates=54000, lr=0.000272166, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.6, wall=0 epoch 032: 1241 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=292901, ups=0.68, wpb=427874, bsz=16285.9, num_updates=54000, lr=0.000272166, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.6, wall=0 epoch 032: 1241 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=292901, ups=0.68, wpb=427874, bsz=16285.9, num_updates=54000, lr=0.000272166, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.6, wall=0 epoch 032: 1241 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=292901, ups=0.68, wpb=427874, bsz=16285.9, num_updates=54000, lr=0.000272166, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.6, wall=0 epoch 032: 1241 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=292901, ups=0.68, wpb=427874, bsz=16285.9, num_updates=54000, lr=0.000272166, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.6, wall=0 epoch 032: 1241 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=292901, ups=0.68, wpb=427874, bsz=16285.9, num_updates=54000, lr=0.000272166, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.6, wall=0 epoch 032: 1241 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=292901, ups=0.68, wpb=427874, bsz=16285.9, num_updates=54000, lr=0.000272166, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.6, wall=0 epoch 032: 1241 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=292901, ups=0.68, wpb=427874, bsz=16285.9, num_updates=54000, lr=0.000272166, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.6, wall=0 epoch 032: 1241 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=292901, ups=0.68, wpb=427874, bsz=16285.9, num_updates=54000, lr=0.000272166, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.6, wall=0 epoch 032: 1241 / 1707 loss=4.006, nll_loss=2.366, ppl=5.16, wps=292901, ups=0.68, wpb=427874, bsz=16285.9, num_updates=54000, lr=0.000272166, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.6, wall=0 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.663 | ppl 6.33 | wps 76611.4 | wpb 21331 | bsz 1016 | num_updates 54000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.663 | ppl 6.33 | wps 76611.4 | wpb 21331 | bsz 1016 | num_updates 54000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.663 | ppl 6.33 | wps 76611.4 | wpb 21331 | bsz 1016 | num_updates 54000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.663 | ppl 6.33 | wps 76611.4 | wpb 21331 | bsz 1016 | num_updates 54000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.663 | ppl 6.33 | wps 76611.4 | wpb 21331 | bsz 1016 | num_updates 54000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.663 | ppl 6.33 | wps 76611.4 | wpb 21331 | bsz 1016 | num_updates 54000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.663 | ppl 6.33 | wps 76611.4 | wpb 21331 | bsz 1016 | num_updates 54000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.663 | ppl 6.33 | wps 76611.4 | wpb 21331 | bsz 1016 | num_updates 54000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.663 | ppl 6.33 | wps 76611.4 | wpb 21331 | bsz 1016 | num_updates 54000 | best_loss 4.279 epoch 032 | valid on 'valid' subset | loss 4.287 | nll_loss 2.663 | ppl 6.33 | wps 76611.4 | wpb 21331 | bsz 1016 | num_updates 54000 | best_loss 4.279 epoch 032: 1341 / 1707 loss=4.014, nll_loss=2.376, ppl=5.19, wps=267066, ups=0.62, wpb=429817, bsz=16245.7, num_updates=54100, lr=0.000271914, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 032: 1341 / 1707 loss=4.014, nll_loss=2.376, ppl=5.19, wps=267066, ups=0.62, wpb=429817, bsz=16245.7, num_updates=54100, lr=0.000271914, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 032: 1341 / 1707 loss=4.014, nll_loss=2.376, ppl=5.19, wps=267066, ups=0.62, wpb=429817, bsz=16245.7, num_updates=54100, lr=0.000271914, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 032: 1341 / 1707 loss=4.014, nll_loss=2.376, ppl=5.19, wps=267066, ups=0.62, wpb=429817, bsz=16245.7, num_updates=54100, lr=0.000271914, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 032: 1341 / 1707 loss=4.014, nll_loss=2.376, ppl=5.19, wps=267066, ups=0.62, wpb=429817, bsz=16245.7, num_updates=54100, lr=0.000271914, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 032: 1341 / 1707 loss=4.014, nll_loss=2.376, ppl=5.19, wps=267066, ups=0.62, wpb=429817, bsz=16245.7, num_updates=54100, lr=0.000271914, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 032: 1341 / 1707 loss=4.014, nll_loss=2.376, ppl=5.19, wps=267066, ups=0.62, wpb=429817, bsz=16245.7, num_updates=54100, lr=0.000271914, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 032: 1341 / 1707 loss=4.014, nll_loss=2.376, ppl=5.19, wps=267066, ups=0.62, wpb=429817, bsz=16245.7, num_updates=54100, lr=0.000271914, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 032: 1341 / 1707 loss=4.014, nll_loss=2.376, ppl=5.19, wps=267066, ups=0.62, wpb=429817, bsz=16245.7, num_updates=54100, lr=0.000271914, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 032: 1341 / 1707 loss=4.014, nll_loss=2.376, ppl=5.19, wps=267066, ups=0.62, wpb=429817, bsz=16245.7, num_updates=54100, lr=0.000271914, gnorm=0.232, clip=0, loss_scale=1, train_wall=143, gb_free=59.6, wall=0 epoch 032: 1442 / 1707 loss=4.012, nll_loss=2.374, ppl=5.18, wps=295183, ups=0.69, wpb=429919, bsz=16368.7, num_updates=54200, lr=0.000271663, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 032: 1442 / 1707 loss=4.012, nll_loss=2.374, ppl=5.18, wps=295183, ups=0.69, wpb=429919, bsz=16368.7, num_updates=54200, lr=0.000271663, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 032: 1442 / 1707 loss=4.012, nll_loss=2.374, ppl=5.18, wps=295183, ups=0.69, wpb=429919, bsz=16368.7, num_updates=54200, lr=0.000271663, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 032: 1442 / 1707 loss=4.012, nll_loss=2.374, ppl=5.18, wps=295183, ups=0.69, wpb=429919, bsz=16368.7, num_updates=54200, lr=0.000271663, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 032: 1442 / 1707 loss=4.012, nll_loss=2.374, ppl=5.18, wps=295183, ups=0.69, wpb=429919, bsz=16368.7, num_updates=54200, lr=0.000271663, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 032: 1442 / 1707 loss=4.012, nll_loss=2.374, ppl=5.18, wps=295183, ups=0.69, wpb=429919, bsz=16368.7, num_updates=54200, lr=0.000271663, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 032: 1442 / 1707 loss=4.012, nll_loss=2.374, ppl=5.18, wps=295183, ups=0.69, wpb=429919, bsz=16368.7, num_updates=54200, lr=0.000271663, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 032: 1442 / 1707 loss=4.012, nll_loss=2.374, ppl=5.18, wps=295183, ups=0.69, wpb=429919, bsz=16368.7, num_updates=54200, lr=0.000271663, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 032: 1442 / 1707 loss=4.012, nll_loss=2.374, ppl=5.18, wps=295183, ups=0.69, wpb=429919, bsz=16368.7, num_updates=54200, lr=0.000271663, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 032: 1442 / 1707 loss=4.012, nll_loss=2.374, ppl=5.18, wps=295183, ups=0.69, wpb=429919, bsz=16368.7, num_updates=54200, lr=0.000271663, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 032: 1542 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=298257, ups=0.69, wpb=430982, bsz=16305.4, num_updates=54300, lr=0.000271413, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.7, wall=0 epoch 032: 1542 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=298257, ups=0.69, wpb=430982, bsz=16305.4, num_updates=54300, lr=0.000271413, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.7, wall=0 epoch 032: 1542 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=298257, ups=0.69, wpb=430982, bsz=16305.4, num_updates=54300, lr=0.000271413, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.7, wall=0 epoch 032: 1542 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=298257, ups=0.69, wpb=430982, bsz=16305.4, num_updates=54300, lr=0.000271413, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.7, wall=0 epoch 032: 1542 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=298257, ups=0.69, wpb=430982, bsz=16305.4, num_updates=54300, lr=0.000271413, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.7, wall=0 epoch 032: 1542 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=298257, ups=0.69, wpb=430982, bsz=16305.4, num_updates=54300, lr=0.000271413, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.7, wall=0 epoch 032: 1542 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=298257, ups=0.69, wpb=430982, bsz=16305.4, num_updates=54300, lr=0.000271413, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.7, wall=0 epoch 032: 1542 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=298257, ups=0.69, wpb=430982, bsz=16305.4, num_updates=54300, lr=0.000271413, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.7, wall=0 epoch 032: 1542 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=298257, ups=0.69, wpb=430982, bsz=16305.4, num_updates=54300, lr=0.000271413, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.7, wall=0 epoch 032: 1542 / 1707 loss=4.02, nll_loss=2.383, ppl=5.22, wps=298257, ups=0.69, wpb=430982, bsz=16305.4, num_updates=54300, lr=0.000271413, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.7, wall=0 epoch 032: 1642 / 1707 loss=4.017, nll_loss=2.379, ppl=5.2, wps=296569, ups=0.69, wpb=428437, bsz=16154.3, num_updates=54400, lr=0.000271163, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 1642 / 1707 loss=4.017, nll_loss=2.379, ppl=5.2, wps=296569, ups=0.69, wpb=428437, bsz=16154.3, num_updates=54400, lr=0.000271163, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 1642 / 1707 loss=4.017, nll_loss=2.379, ppl=5.2, wps=296569, ups=0.69, wpb=428437, bsz=16154.3, num_updates=54400, lr=0.000271163, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 1642 / 1707 loss=4.017, nll_loss=2.379, ppl=5.2, wps=296569, ups=0.69, wpb=428437, bsz=16154.3, num_updates=54400, lr=0.000271163, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 1642 / 1707 loss=4.017, nll_loss=2.379, ppl=5.2, wps=296569, ups=0.69, wpb=428437, bsz=16154.3, num_updates=54400, lr=0.000271163, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 1642 / 1707 loss=4.017, nll_loss=2.379, ppl=5.2, wps=296569, ups=0.69, wpb=428437, bsz=16154.3, num_updates=54400, lr=0.000271163, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 1642 / 1707 loss=4.017, nll_loss=2.379, ppl=5.2, wps=296569, ups=0.69, wpb=428437, bsz=16154.3, num_updates=54400, lr=0.000271163, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 1642 / 1707 loss=4.017, nll_loss=2.379, ppl=5.2, wps=296569, ups=0.69, wpb=428437, bsz=16154.3, num_updates=54400, lr=0.000271163, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 1642 / 1707 loss=4.017, nll_loss=2.379, ppl=5.2, wps=296569, ups=0.69, wpb=428437, bsz=16154.3, num_updates=54400, lr=0.000271163, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 032: 1642 / 1707 loss=4.017, nll_loss=2.379, ppl=5.2, wps=296569, ups=0.69, wpb=428437, bsz=16154.3, num_updates=54400, lr=0.000271163, gnorm=0.241, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 end of epoch 32 (average epoch stats below) epoch 032 | loss 4.004 | nll_loss 2.364 | ppl 5.15 | wps 292923 | ups 0.68 | wpb 428946 | bsz 16334.9 | num_updates 54465 | lr 0.000271001 | gnorm 0.24 | clip 0.1 | loss_scale 1 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 032 | loss 4.004 | nll_loss 2.364 | ppl 5.15 | wps 292923 | ups 0.68 | wpb 428946 | bsz 16334.9 | num_updates 54465 | lr 0.000271001 | gnorm 0.24 | clip 0.1 | loss_scale 1 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 032 | loss 4.004 | nll_loss 2.364 | ppl 5.15 | wps 292923 | ups 0.68 | wpb 428946 | bsz 16334.9 | num_updates 54465 | lr 0.000271001 | gnorm 0.24 | clip 0.1 | loss_scale 1 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 032 | loss 4.004 | nll_loss 2.364 | ppl 5.15 | wps 292923 | ups 0.68 | wpb 428946 | bsz 16334.9 | num_updates 54465 | lr 0.000271001 | gnorm 0.24 | clip 0.1 | loss_scale 1 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 032 | loss 4.004 | nll_loss 2.364 | ppl 5.15 | wps 292923 | ups 0.68 | wpb 428946 | bsz 16334.9 | num_updates 54465 | lr 0.000271001 | gnorm 0.24 | clip 0.1 | loss_scale 1 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 032 | loss 4.004 | nll_loss 2.364 | ppl 5.15 | wps 292923 | ups 0.68 | wpb 428946 | bsz 16334.9 | num_updates 54465 | lr 0.000271001 | gnorm 0.24 | clip 0.1 | loss_scale 1 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 032 | loss 4.004 | nll_loss 2.364 | ppl 5.15 | wps 292923 | ups 0.68 | wpb 428946 | bsz 16334.9 | num_updates 54465 | lr 0.000271001 | gnorm 0.24 | clip 0.1 | loss_scale 1 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 032 | loss 4.004 | nll_loss 2.364 | ppl 5.15 | wps 292923 | ups 0.68 | wpb 428946 | bsz 16334.9 | num_updates 54465 | lr 0.000271001 | gnorm 0.24 | clip 0.1 | loss_scale 1 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 032 | loss 4.004 | nll_loss 2.364 | ppl 5.15 | wps 292923 | ups 0.68 | wpb 428946 | bsz 16334.9 | num_updates 54465 | lr 0.000271001 | gnorm 0.24 | clip 0.1 | loss_scale 1 | train_wall 2450 | gb_free 59.2 | wall 0 epoch 032 | loss 4.004 | nll_loss 2.364 | ppl 5.15 | wps 292923 | ups 0.68 | wpb 428946 | bsz 16334.9 | num_updates 54465 | lr 0.000271001 | gnorm 0.24 | clip 0.1 | loss_scale 1 | train_wall 2450 | gb_free 59.2 | wall 0 Start iterating over samples epoch 033: 35 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=295206, ups=0.69, wpb=425165, bsz=16244.2, num_updates=54500, lr=0.000270914, gnorm=0.27, clip=1, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 033: 35 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=295206, ups=0.69, wpb=425165, bsz=16244.2, num_updates=54500, lr=0.000270914, gnorm=0.27, clip=1, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 033: 35 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=295206, ups=0.69, wpb=425165, bsz=16244.2, num_updates=54500, lr=0.000270914, gnorm=0.27, clip=1, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 033: 35 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=295206, ups=0.69, wpb=425165, bsz=16244.2, num_updates=54500, lr=0.000270914, gnorm=0.27, clip=1, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 033: 35 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=295206, ups=0.69, wpb=425165, bsz=16244.2, num_updates=54500, lr=0.000270914, gnorm=0.27, clip=1, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 033: 35 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=295206, ups=0.69, wpb=425165, bsz=16244.2, num_updates=54500, lr=0.000270914, gnorm=0.27, clip=1, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 033: 35 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=295206, ups=0.69, wpb=425165, bsz=16244.2, num_updates=54500, lr=0.000270914, gnorm=0.27, clip=1, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 033: 35 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=295206, ups=0.69, wpb=425165, bsz=16244.2, num_updates=54500, lr=0.000270914, gnorm=0.27, clip=1, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 033: 35 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=295206, ups=0.69, wpb=425165, bsz=16244.2, num_updates=54500, lr=0.000270914, gnorm=0.27, clip=1, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 033: 35 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=295206, ups=0.69, wpb=425165, bsz=16244.2, num_updates=54500, lr=0.000270914, gnorm=0.27, clip=1, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 033: 35 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=295206, ups=0.69, wpb=425165, bsz=16244.2, num_updates=54500, lr=0.000270914, gnorm=0.27, clip=1, loss_scale=1, train_wall=143, gb_free=59.4, wall=0 epoch 033: 135 / 1707 loss=3.985, nll_loss=2.342, ppl=5.07, wps=296411, ups=0.69, wpb=429096, bsz=16687.1, num_updates=54600, lr=0.000270666, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 033: 135 / 1707 loss=3.985, nll_loss=2.342, ppl=5.07, wps=296411, ups=0.69, wpb=429096, bsz=16687.1, num_updates=54600, lr=0.000270666, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 033: 135 / 1707 loss=3.985, nll_loss=2.342, ppl=5.07, wps=296411, ups=0.69, wpb=429096, bsz=16687.1, num_updates=54600, lr=0.000270666, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 033: 135 / 1707 loss=3.985, nll_loss=2.342, ppl=5.07, wps=296411, ups=0.69, wpb=429096, bsz=16687.1, num_updates=54600, lr=0.000270666, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 033: 135 / 1707 loss=3.985, nll_loss=2.342, ppl=5.07, wps=296411, ups=0.69, wpb=429096, bsz=16687.1, num_updates=54600, lr=0.000270666, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 033: 135 / 1707 loss=3.985, nll_loss=2.342, ppl=5.07, wps=296411, ups=0.69, wpb=429096, bsz=16687.1, num_updates=54600, lr=0.000270666, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 033: 135 / 1707 loss=3.985, nll_loss=2.342, ppl=5.07, wps=296411, ups=0.69, wpb=429096, bsz=16687.1, num_updates=54600, lr=0.000270666, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 033: 135 / 1707 loss=3.985, nll_loss=2.342, ppl=5.07, wps=296411, ups=0.69, wpb=429096, bsz=16687.1, num_updates=54600, lr=0.000270666, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 033: 135 / 1707 loss=3.985, nll_loss=2.342, ppl=5.07, wps=296411, ups=0.69, wpb=429096, bsz=16687.1, num_updates=54600, lr=0.000270666, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 033: 135 / 1707 loss=3.985, nll_loss=2.342, ppl=5.07, wps=296411, ups=0.69, wpb=429096, bsz=16687.1, num_updates=54600, lr=0.000270666, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 033: 135 / 1707 loss=3.985, nll_loss=2.342, ppl=5.07, wps=296411, ups=0.69, wpb=429096, bsz=16687.1, num_updates=54600, lr=0.000270666, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=58.7, wall=0 epoch 033: 235 / 1707 loss=3.982, nll_loss=2.338, ppl=5.06, wps=297650, ups=0.69, wpb=430186, bsz=16296.6, num_updates=54700, lr=0.000270418, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 033: 235 / 1707 loss=3.982, nll_loss=2.338, ppl=5.06, wps=297650, ups=0.69, wpb=430186, bsz=16296.6, num_updates=54700, lr=0.000270418, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 033: 235 / 1707 loss=3.982, nll_loss=2.338, ppl=5.06, wps=297650, ups=0.69, wpb=430186, bsz=16296.6, num_updates=54700, lr=0.000270418, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 033: 235 / 1707 loss=3.982, nll_loss=2.338, ppl=5.06, wps=297650, ups=0.69, wpb=430186, bsz=16296.6, num_updates=54700, lr=0.000270418, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 033: 235 / 1707 loss=3.982, nll_loss=2.338, ppl=5.06, wps=297650, ups=0.69, wpb=430186, bsz=16296.6, num_updates=54700, lr=0.000270418, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 033: 235 / 1707 loss=3.982, nll_loss=2.338, ppl=5.06, wps=297650, ups=0.69, wpb=430186, bsz=16296.6, num_updates=54700, lr=0.000270418, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 033: 235 / 1707 loss=3.982, nll_loss=2.338, ppl=5.06, wps=297650, ups=0.69, wpb=430186, bsz=16296.6, num_updates=54700, lr=0.000270418, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 033: 235 / 1707 loss=3.982, nll_loss=2.338, ppl=5.06, wps=297650, ups=0.69, wpb=430186, bsz=16296.6, num_updates=54700, lr=0.000270418, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 033: 235 / 1707 loss=3.982, nll_loss=2.338, ppl=5.06, wps=297650, ups=0.69, wpb=430186, bsz=16296.6, num_updates=54700, lr=0.000270418, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 033: 235 / 1707 loss=3.982, nll_loss=2.338, ppl=5.06, wps=297650, ups=0.69, wpb=430186, bsz=16296.6, num_updates=54700, lr=0.000270418, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 033: 235 / 1707 loss=3.982, nll_loss=2.338, ppl=5.06, wps=297650, ups=0.69, wpb=430186, bsz=16296.6, num_updates=54700, lr=0.000270418, gnorm=0.231, clip=0, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 033: 336 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=293161, ups=0.68, wpb=429280, bsz=16564.1, num_updates=54800, lr=0.000270172, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=0 epoch 033: 336 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=293161, ups=0.68, wpb=429280, bsz=16564.1, num_updates=54800, lr=0.000270172, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=0 epoch 033: 336 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=293161, ups=0.68, wpb=429280, bsz=16564.1, num_updates=54800, lr=0.000270172, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=0 epoch 033: 336 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=293161, ups=0.68, wpb=429280, bsz=16564.1, num_updates=54800, lr=0.000270172, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=0 epoch 033: 336 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=293161, ups=0.68, wpb=429280, bsz=16564.1, num_updates=54800, lr=0.000270172, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=0 epoch 033: 336 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=293161, ups=0.68, wpb=429280, bsz=16564.1, num_updates=54800, lr=0.000270172, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=0 epoch 033: 336 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=293161, ups=0.68, wpb=429280, bsz=16564.1, num_updates=54800, lr=0.000270172, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=0 epoch 033: 336 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=293161, ups=0.68, wpb=429280, bsz=16564.1, num_updates=54800, lr=0.000270172, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=0 epoch 033: 336 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=293161, ups=0.68, wpb=429280, bsz=16564.1, num_updates=54800, lr=0.000270172, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=0 epoch 033: 336 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=293161, ups=0.68, wpb=429280, bsz=16564.1, num_updates=54800, lr=0.000270172, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=0 epoch 033: 336 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=293161, ups=0.68, wpb=429280, bsz=16564.1, num_updates=54800, lr=0.000270172, gnorm=0.237, clip=0, loss_scale=1, train_wall=146, gb_free=59.3, wall=0 epoch 033: 436 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297721, ups=0.69, wpb=431389, bsz=16635.2, num_updates=54900, lr=0.000269925, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 033: 436 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297721, ups=0.69, wpb=431389, bsz=16635.2, num_updates=54900, lr=0.000269925, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 033: 436 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297721, ups=0.69, wpb=431389, bsz=16635.2, num_updates=54900, lr=0.000269925, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 033: 436 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297721, ups=0.69, wpb=431389, bsz=16635.2, num_updates=54900, lr=0.000269925, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 033: 436 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297721, ups=0.69, wpb=431389, bsz=16635.2, num_updates=54900, lr=0.000269925, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 033: 436 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297721, ups=0.69, wpb=431389, bsz=16635.2, num_updates=54900, lr=0.000269925, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 033: 436 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297721, ups=0.69, wpb=431389, bsz=16635.2, num_updates=54900, lr=0.000269925, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 033: 436 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297721, ups=0.69, wpb=431389, bsz=16635.2, num_updates=54900, lr=0.000269925, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 033: 436 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297721, ups=0.69, wpb=431389, bsz=16635.2, num_updates=54900, lr=0.000269925, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 033: 436 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297721, ups=0.69, wpb=431389, bsz=16635.2, num_updates=54900, lr=0.000269925, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 033: 436 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297721, ups=0.69, wpb=431389, bsz=16635.2, num_updates=54900, lr=0.000269925, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=58.8, wall=0 epoch 033: 537 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=293704, ups=0.69, wpb=427262, bsz=16390.7, num_updates=55000, lr=0.00026968, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.3, wall=0 epoch 033: 537 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=293704, ups=0.69, wpb=427262, bsz=16390.7, num_updates=55000, lr=0.00026968, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.3, wall=0 epoch 033: 537 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=293704, ups=0.69, wpb=427262, bsz=16390.7, num_updates=55000, lr=0.00026968, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.3, wall=0 epoch 033: 537 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=293704, ups=0.69, wpb=427262, bsz=16390.7, num_updates=55000, lr=0.00026968, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.3, wall=0 epoch 033: 537 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=293704, ups=0.69, wpb=427262, bsz=16390.7, num_updates=55000, lr=0.00026968, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.3, wall=0 epoch 033: 537 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=293704, ups=0.69, wpb=427262, bsz=16390.7, num_updates=55000, lr=0.00026968, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.3, wall=0 epoch 033: 537 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=293704, ups=0.69, wpb=427262, bsz=16390.7, num_updates=55000, lr=0.00026968, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.3, wall=0 epoch 033: 537 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=293704, ups=0.69, wpb=427262, bsz=16390.7, num_updates=55000, lr=0.00026968, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.3, wall=0 epoch 033: 537 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=293704, ups=0.69, wpb=427262, bsz=16390.7, num_updates=55000, lr=0.00026968, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.3, wall=0 epoch 033: 537 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=293704, ups=0.69, wpb=427262, bsz=16390.7, num_updates=55000, lr=0.00026968, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.3, wall=0 epoch 033: 537 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=293704, ups=0.69, wpb=427262, bsz=16390.7, num_updates=55000, lr=0.00026968, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.3, wall=0 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 4.296 | nll_loss 2.678 | ppl 6.4 | wps 75750.1 | wpb 21331 | bsz 1016 | num_updates 55000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.296 | nll_loss 2.678 | ppl 6.4 | wps 75750.1 | wpb 21331 | bsz 1016 | num_updates 55000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.296 | nll_loss 2.678 | ppl 6.4 | wps 75750.1 | wpb 21331 | bsz 1016 | num_updates 55000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.296 | nll_loss 2.678 | ppl 6.4 | wps 75750.1 | wpb 21331 | bsz 1016 | num_updates 55000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.296 | nll_loss 2.678 | ppl 6.4 | wps 75750.1 | wpb 21331 | bsz 1016 | num_updates 55000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.296 | nll_loss 2.678 | ppl 6.4 | wps 75750.1 | wpb 21331 | bsz 1016 | num_updates 55000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.296 | nll_loss 2.678 | ppl 6.4 | wps 75750.1 | wpb 21331 | bsz 1016 | num_updates 55000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.296 | nll_loss 2.678 | ppl 6.4 | wps 75750.1 | wpb 21331 | bsz 1016 | num_updates 55000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.296 | nll_loss 2.678 | ppl 6.4 | wps 75750.1 | wpb 21331 | bsz 1016 | num_updates 55000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.296 | nll_loss 2.678 | ppl 6.4 | wps 75750.1 | wpb 21331 | bsz 1016 | num_updates 55000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.296 | nll_loss 2.678 | ppl 6.4 | wps 75750.1 | wpb 21331 | bsz 1016 | num_updates 55000 | best_loss 4.279 epoch 033: 637 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=265895, ups=0.62, wpb=428701, bsz=16286.7, num_updates=55100, lr=0.000269435, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 033: 637 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=265895, ups=0.62, wpb=428701, bsz=16286.7, num_updates=55100, lr=0.000269435, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 033: 637 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=265895, ups=0.62, wpb=428701, bsz=16286.7, num_updates=55100, lr=0.000269435, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 033: 637 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=265895, ups=0.62, wpb=428701, bsz=16286.7, num_updates=55100, lr=0.000269435, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 033: 637 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=265895, ups=0.62, wpb=428701, bsz=16286.7, num_updates=55100, lr=0.000269435, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 033: 637 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=265895, ups=0.62, wpb=428701, bsz=16286.7, num_updates=55100, lr=0.000269435, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 033: 637 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=265895, ups=0.62, wpb=428701, bsz=16286.7, num_updates=55100, lr=0.000269435, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 033: 637 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=265895, ups=0.62, wpb=428701, bsz=16286.7, num_updates=55100, lr=0.000269435, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 033: 637 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=265895, ups=0.62, wpb=428701, bsz=16286.7, num_updates=55100, lr=0.000269435, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 033: 637 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=265895, ups=0.62, wpb=428701, bsz=16286.7, num_updates=55100, lr=0.000269435, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 033: 637 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=265895, ups=0.62, wpb=428701, bsz=16286.7, num_updates=55100, lr=0.000269435, gnorm=0.235, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 033: 737 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297778, ups=0.7, wpb=427362, bsz=16145.5, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 033: 737 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297778, ups=0.7, wpb=427362, bsz=16145.5, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 033: 737 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297778, ups=0.7, wpb=427362, bsz=16145.5, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 033: 737 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297778, ups=0.7, wpb=427362, bsz=16145.5, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 033: 737 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297778, ups=0.7, wpb=427362, bsz=16145.5, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 033: 737 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297778, ups=0.7, wpb=427362, bsz=16145.5, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 033: 737 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297778, ups=0.7, wpb=427362, bsz=16145.5, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 033: 737 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297778, ups=0.7, wpb=427362, bsz=16145.5, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 033: 737 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297778, ups=0.7, wpb=427362, bsz=16145.5, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 033: 737 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297778, ups=0.7, wpb=427362, bsz=16145.5, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 033: 737 / 1707 loss=4.003, nll_loss=2.362, ppl=5.14, wps=297778, ups=0.7, wpb=427362, bsz=16145.5, num_updates=55200, lr=0.000269191, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 033: 837 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297216, ups=0.69, wpb=428634, bsz=16338.6, num_updates=55300, lr=0.000268947, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 033: 837 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297216, ups=0.69, wpb=428634, bsz=16338.6, num_updates=55300, lr=0.000268947, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 033: 837 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297216, ups=0.69, wpb=428634, bsz=16338.6, num_updates=55300, lr=0.000268947, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 033: 837 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297216, ups=0.69, wpb=428634, bsz=16338.6, num_updates=55300, lr=0.000268947, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 033: 837 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297216, ups=0.69, wpb=428634, bsz=16338.6, num_updates=55300, lr=0.000268947, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 033: 837 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297216, ups=0.69, wpb=428634, bsz=16338.6, num_updates=55300, lr=0.000268947, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 033: 837 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297216, ups=0.69, wpb=428634, bsz=16338.6, num_updates=55300, lr=0.000268947, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 033: 837 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297216, ups=0.69, wpb=428634, bsz=16338.6, num_updates=55300, lr=0.000268947, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 033: 837 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297216, ups=0.69, wpb=428634, bsz=16338.6, num_updates=55300, lr=0.000268947, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 033: 837 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297216, ups=0.69, wpb=428634, bsz=16338.6, num_updates=55300, lr=0.000268947, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 033: 837 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=297216, ups=0.69, wpb=428634, bsz=16338.6, num_updates=55300, lr=0.000268947, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 033: 938 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=294414, ups=0.69, wpb=428883, bsz=16363.8, num_updates=55400, lr=0.000268705, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 938 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=294414, ups=0.69, wpb=428883, bsz=16363.8, num_updates=55400, lr=0.000268705, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 938 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=294414, ups=0.69, wpb=428883, bsz=16363.8, num_updates=55400, lr=0.000268705, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 938 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=294414, ups=0.69, wpb=428883, bsz=16363.8, num_updates=55400, lr=0.000268705, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 938 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=294414, ups=0.69, wpb=428883, bsz=16363.8, num_updates=55400, lr=0.000268705, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 938 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=294414, ups=0.69, wpb=428883, bsz=16363.8, num_updates=55400, lr=0.000268705, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 938 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=294414, ups=0.69, wpb=428883, bsz=16363.8, num_updates=55400, lr=0.000268705, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 938 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=294414, ups=0.69, wpb=428883, bsz=16363.8, num_updates=55400, lr=0.000268705, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 938 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=294414, ups=0.69, wpb=428883, bsz=16363.8, num_updates=55400, lr=0.000268705, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 938 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=294414, ups=0.69, wpb=428883, bsz=16363.8, num_updates=55400, lr=0.000268705, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 938 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=294414, ups=0.69, wpb=428883, bsz=16363.8, num_updates=55400, lr=0.000268705, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1038 / 1707 loss=4.004, nll_loss=2.364, ppl=5.15, wps=296212, ups=0.69, wpb=429206, bsz=16663.2, num_updates=55500, lr=0.000268462, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.4, wall=0 epoch 033: 1038 / 1707 loss=4.004, nll_loss=2.364, ppl=5.15, wps=296212, ups=0.69, wpb=429206, bsz=16663.2, num_updates=55500, lr=0.000268462, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.4, wall=0 epoch 033: 1038 / 1707 loss=4.004, nll_loss=2.364, ppl=5.15, wps=296212, ups=0.69, wpb=429206, bsz=16663.2, num_updates=55500, lr=0.000268462, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.4, wall=0 epoch 033: 1038 / 1707 loss=4.004, nll_loss=2.364, ppl=5.15, wps=296212, ups=0.69, wpb=429206, bsz=16663.2, num_updates=55500, lr=0.000268462, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.4, wall=0 epoch 033: 1038 / 1707 loss=4.004, nll_loss=2.364, ppl=5.15, wps=296212, ups=0.69, wpb=429206, bsz=16663.2, num_updates=55500, lr=0.000268462, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.4, wall=0 epoch 033: 1038 / 1707 loss=4.004, nll_loss=2.364, ppl=5.15, wps=296212, ups=0.69, wpb=429206, bsz=16663.2, num_updates=55500, lr=0.000268462, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.4, wall=0 epoch 033: 1038 / 1707 loss=4.004, nll_loss=2.364, ppl=5.15, wps=296212, ups=0.69, wpb=429206, bsz=16663.2, num_updates=55500, lr=0.000268462, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.4, wall=0 epoch 033: 1038 / 1707 loss=4.004, nll_loss=2.364, ppl=5.15, wps=296212, ups=0.69, wpb=429206, bsz=16663.2, num_updates=55500, lr=0.000268462, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.4, wall=0 epoch 033: 1038 / 1707 loss=4.004, nll_loss=2.364, ppl=5.15, wps=296212, ups=0.69, wpb=429206, bsz=16663.2, num_updates=55500, lr=0.000268462, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.4, wall=0 epoch 033: 1038 / 1707 loss=4.004, nll_loss=2.364, ppl=5.15, wps=296212, ups=0.69, wpb=429206, bsz=16663.2, num_updates=55500, lr=0.000268462, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.4, wall=0 epoch 033: 1038 / 1707 loss=4.004, nll_loss=2.364, ppl=5.15, wps=296212, ups=0.69, wpb=429206, bsz=16663.2, num_updates=55500, lr=0.000268462, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.4, wall=0 epoch 033: 1138 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=298444, ups=0.69, wpb=430388, bsz=15927.8, num_updates=55600, lr=0.000268221, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=143, gb_free=58.9, wall=0 epoch 033: 1138 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=298444, ups=0.69, wpb=430388, bsz=15927.8, num_updates=55600, lr=0.000268221, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=143, gb_free=58.9, wall=0 epoch 033: 1138 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=298444, ups=0.69, wpb=430388, bsz=15927.8, num_updates=55600, lr=0.000268221, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=143, gb_free=58.9, wall=0 epoch 033: 1138 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=298444, ups=0.69, wpb=430388, bsz=15927.8, num_updates=55600, lr=0.000268221, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=143, gb_free=58.9, wall=0 epoch 033: 1138 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=298444, ups=0.69, wpb=430388, bsz=15927.8, num_updates=55600, lr=0.000268221, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=143, gb_free=58.9, wall=0 epoch 033: 1138 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=298444, ups=0.69, wpb=430388, bsz=15927.8, num_updates=55600, lr=0.000268221, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=143, gb_free=58.9, wall=0 epoch 033: 1138 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=298444, ups=0.69, wpb=430388, bsz=15927.8, num_updates=55600, lr=0.000268221, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=143, gb_free=58.9, wall=0 epoch 033: 1138 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=298444, ups=0.69, wpb=430388, bsz=15927.8, num_updates=55600, lr=0.000268221, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=143, gb_free=58.9, wall=0 epoch 033: 1138 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=298444, ups=0.69, wpb=430388, bsz=15927.8, num_updates=55600, lr=0.000268221, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=143, gb_free=58.9, wall=0 epoch 033: 1138 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=298444, ups=0.69, wpb=430388, bsz=15927.8, num_updates=55600, lr=0.000268221, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=143, gb_free=58.9, wall=0 epoch 033: 1138 / 1707 loss=4.006, nll_loss=2.367, ppl=5.16, wps=298444, ups=0.69, wpb=430388, bsz=15927.8, num_updates=55600, lr=0.000268221, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=143, gb_free=58.9, wall=0 epoch 033: 1238 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296210, ups=0.69, wpb=428571, bsz=16295.6, num_updates=55700, lr=0.00026798, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 033: 1238 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296210, ups=0.69, wpb=428571, bsz=16295.6, num_updates=55700, lr=0.00026798, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 033: 1238 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296210, ups=0.69, wpb=428571, bsz=16295.6, num_updates=55700, lr=0.00026798, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 033: 1238 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296210, ups=0.69, wpb=428571, bsz=16295.6, num_updates=55700, lr=0.00026798, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 033: 1238 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296210, ups=0.69, wpb=428571, bsz=16295.6, num_updates=55700, lr=0.00026798, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 033: 1238 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296210, ups=0.69, wpb=428571, bsz=16295.6, num_updates=55700, lr=0.00026798, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 033: 1238 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296210, ups=0.69, wpb=428571, bsz=16295.6, num_updates=55700, lr=0.00026798, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 033: 1238 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296210, ups=0.69, wpb=428571, bsz=16295.6, num_updates=55700, lr=0.00026798, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 033: 1238 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296210, ups=0.69, wpb=428571, bsz=16295.6, num_updates=55700, lr=0.00026798, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 033: 1238 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296210, ups=0.69, wpb=428571, bsz=16295.6, num_updates=55700, lr=0.00026798, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 033: 1238 / 1707 loss=4.007, nll_loss=2.367, ppl=5.16, wps=296210, ups=0.69, wpb=428571, bsz=16295.6, num_updates=55700, lr=0.00026798, gnorm=0.244, clip=0, loss_scale=1, train_wall=144, gb_free=59.2, wall=0 epoch 033: 1338 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298084, ups=0.69, wpb=430283, bsz=16264.6, num_updates=55800, lr=0.00026774, gnorm=0.223, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 033: 1338 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298084, ups=0.69, wpb=430283, bsz=16264.6, num_updates=55800, lr=0.00026774, gnorm=0.223, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 033: 1338 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298084, ups=0.69, wpb=430283, bsz=16264.6, num_updates=55800, lr=0.00026774, gnorm=0.223, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 033: 1338 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298084, ups=0.69, wpb=430283, bsz=16264.6, num_updates=55800, lr=0.00026774, gnorm=0.223, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 033: 1338 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298084, ups=0.69, wpb=430283, bsz=16264.6, num_updates=55800, lr=0.00026774, gnorm=0.223, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 033: 1338 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298084, ups=0.69, wpb=430283, bsz=16264.6, num_updates=55800, lr=0.00026774, gnorm=0.223, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 033: 1338 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298084, ups=0.69, wpb=430283, bsz=16264.6, num_updates=55800, lr=0.00026774, gnorm=0.223, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 033: 1338 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298084, ups=0.69, wpb=430283, bsz=16264.6, num_updates=55800, lr=0.00026774, gnorm=0.223, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 033: 1338 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298084, ups=0.69, wpb=430283, bsz=16264.6, num_updates=55800, lr=0.00026774, gnorm=0.223, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 033: 1338 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298084, ups=0.69, wpb=430283, bsz=16264.6, num_updates=55800, lr=0.00026774, gnorm=0.223, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 033: 1338 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=298084, ups=0.69, wpb=430283, bsz=16264.6, num_updates=55800, lr=0.00026774, gnorm=0.223, clip=0, loss_scale=1, train_wall=143, gb_free=59.2, wall=0 epoch 033: 1439 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=293997, ups=0.69, wpb=428344, bsz=16163.7, num_updates=55900, lr=0.0002675, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1439 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=293997, ups=0.69, wpb=428344, bsz=16163.7, num_updates=55900, lr=0.0002675, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1439 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=293997, ups=0.69, wpb=428344, bsz=16163.7, num_updates=55900, lr=0.0002675, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1439 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=293997, ups=0.69, wpb=428344, bsz=16163.7, num_updates=55900, lr=0.0002675, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1439 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=293997, ups=0.69, wpb=428344, bsz=16163.7, num_updates=55900, lr=0.0002675, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1439 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=293997, ups=0.69, wpb=428344, bsz=16163.7, num_updates=55900, lr=0.0002675, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1439 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=293997, ups=0.69, wpb=428344, bsz=16163.7, num_updates=55900, lr=0.0002675, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1439 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=293997, ups=0.69, wpb=428344, bsz=16163.7, num_updates=55900, lr=0.0002675, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1439 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=293997, ups=0.69, wpb=428344, bsz=16163.7, num_updates=55900, lr=0.0002675, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1439 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=293997, ups=0.69, wpb=428344, bsz=16163.7, num_updates=55900, lr=0.0002675, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1439 / 1707 loss=4.01, nll_loss=2.371, ppl=5.17, wps=293997, ups=0.69, wpb=428344, bsz=16163.7, num_updates=55900, lr=0.0002675, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.1, wall=0 epoch 033: 1539 / 1707 loss=4.013, nll_loss=2.375, ppl=5.19, wps=297403, ups=0.69, wpb=429656, bsz=16239.2, num_updates=56000, lr=0.000267261, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 033: 1539 / 1707 loss=4.013, nll_loss=2.375, ppl=5.19, wps=297403, ups=0.69, wpb=429656, bsz=16239.2, num_updates=56000, lr=0.000267261, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 033: 1539 / 1707 loss=4.013, nll_loss=2.375, ppl=5.19, wps=297403, ups=0.69, wpb=429656, bsz=16239.2, num_updates=56000, lr=0.000267261, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 033: 1539 / 1707 loss=4.013, nll_loss=2.375, ppl=5.19, wps=297403, ups=0.69, wpb=429656, bsz=16239.2, num_updates=56000, lr=0.000267261, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 033: 1539 / 1707 loss=4.013, nll_loss=2.375, ppl=5.19, wps=297403, ups=0.69, wpb=429656, bsz=16239.2, num_updates=56000, lr=0.000267261, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 033: 1539 / 1707 loss=4.013, nll_loss=2.375, ppl=5.19, wps=297403, ups=0.69, wpb=429656, bsz=16239.2, num_updates=56000, lr=0.000267261, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 033: 1539 / 1707 loss=4.013, nll_loss=2.375, ppl=5.19, wps=297403, ups=0.69, wpb=429656, bsz=16239.2, num_updates=56000, lr=0.000267261, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 033: 1539 / 1707 loss=4.013, nll_loss=2.375, ppl=5.19, wps=297403, ups=0.69, wpb=429656, bsz=16239.2, num_updates=56000, lr=0.000267261, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 033: 1539 / 1707 loss=4.013, nll_loss=2.375, ppl=5.19, wps=297403, ups=0.69, wpb=429656, bsz=16239.2, num_updates=56000, lr=0.000267261, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 033: 1539 / 1707 loss=4.013, nll_loss=2.375, ppl=5.19, wps=297403, ups=0.69, wpb=429656, bsz=16239.2, num_updates=56000, lr=0.000267261, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 033: 1539 / 1707 loss=4.013, nll_loss=2.375, ppl=5.19, wps=297403, ups=0.69, wpb=429656, bsz=16239.2, num_updates=56000, lr=0.000267261, gnorm=0.244, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 4.301 | nll_loss 2.676 | ppl 6.39 | wps 75994.8 | wpb 21331 | bsz 1016 | num_updates 56000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.301 | nll_loss 2.676 | ppl 6.39 | wps 75994.8 | wpb 21331 | bsz 1016 | num_updates 56000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.301 | nll_loss 2.676 | ppl 6.39 | wps 75994.8 | wpb 21331 | bsz 1016 | num_updates 56000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.301 | nll_loss 2.676 | ppl 6.39 | wps 75994.8 | wpb 21331 | bsz 1016 | num_updates 56000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.301 | nll_loss 2.676 | ppl 6.39 | wps 75994.8 | wpb 21331 | bsz 1016 | num_updates 56000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.301 | nll_loss 2.676 | ppl 6.39 | wps 75994.8 | wpb 21331 | bsz 1016 | num_updates 56000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.301 | nll_loss 2.676 | ppl 6.39 | wps 75994.8 | wpb 21331 | bsz 1016 | num_updates 56000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.301 | nll_loss 2.676 | ppl 6.39 | wps 75994.8 | wpb 21331 | bsz 1016 | num_updates 56000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.301 | nll_loss 2.676 | ppl 6.39 | wps 75994.8 | wpb 21331 | bsz 1016 | num_updates 56000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.301 | nll_loss 2.676 | ppl 6.39 | wps 75994.8 | wpb 21331 | bsz 1016 | num_updates 56000 | best_loss 4.279 epoch 033 | valid on 'valid' subset | loss 4.301 | nll_loss 2.676 | ppl 6.39 | wps 75994.8 | wpb 21331 | bsz 1016 | num_updates 56000 | best_loss 4.279 epoch 033: 1639 / 1707 loss=4.007, nll_loss=2.369, ppl=5.16, wps=270840, ups=0.63, wpb=428044, bsz=16373, num_updates=56100, lr=0.000267023, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 033: 1639 / 1707 loss=4.007, nll_loss=2.369, ppl=5.16, wps=270840, ups=0.63, wpb=428044, bsz=16373, num_updates=56100, lr=0.000267023, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 033: 1639 / 1707 loss=4.007, nll_loss=2.369, ppl=5.16, wps=270840, ups=0.63, wpb=428044, bsz=16373, num_updates=56100, lr=0.000267023, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 033: 1639 / 1707 loss=4.007, nll_loss=2.369, ppl=5.16, wps=270840, ups=0.63, wpb=428044, bsz=16373, num_updates=56100, lr=0.000267023, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 033: 1639 / 1707 loss=4.007, nll_loss=2.369, ppl=5.16, wps=270840, ups=0.63, wpb=428044, bsz=16373, num_updates=56100, lr=0.000267023, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 033: 1639 / 1707 loss=4.007, nll_loss=2.369, ppl=5.16, wps=270840, ups=0.63, wpb=428044, bsz=16373, num_updates=56100, lr=0.000267023, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 033: 1639 / 1707 loss=4.007, nll_loss=2.369, ppl=5.16, wps=270840, ups=0.63, wpb=428044, bsz=16373, num_updates=56100, lr=0.000267023, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 033: 1639 / 1707 loss=4.007, nll_loss=2.369, ppl=5.16, wps=270840, ups=0.63, wpb=428044, bsz=16373, num_updates=56100, lr=0.000267023, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 033: 1639 / 1707 loss=4.007, nll_loss=2.369, ppl=5.16, wps=270840, ups=0.63, wpb=428044, bsz=16373, num_updates=56100, lr=0.000267023, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 033: 1639 / 1707 loss=4.007, nll_loss=2.369, ppl=5.16, wps=270840, ups=0.63, wpb=428044, bsz=16373, num_updates=56100, lr=0.000267023, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 033: 1639 / 1707 loss=4.007, nll_loss=2.369, ppl=5.16, wps=270840, ups=0.63, wpb=428044, bsz=16373, num_updates=56100, lr=0.000267023, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 end of epoch 33 (average epoch stats below) epoch 033 | loss 4 | nll_loss 2.359 | ppl 5.13 | wps 292801 | ups 0.68 | wpb 428942 | bsz 16332.8 | num_updates 56168 | lr 0.000266861 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2450 | gb_free 60.4 | wall 0 epoch 033 | loss 4 | nll_loss 2.359 | ppl 5.13 | wps 292801 | ups 0.68 | wpb 428942 | bsz 16332.8 | num_updates 56168 | lr 0.000266861 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2450 | gb_free 60.4 | wall 0 epoch 033 | loss 4 | nll_loss 2.359 | ppl 5.13 | wps 292801 | ups 0.68 | wpb 428942 | bsz 16332.8 | num_updates 56168 | lr 0.000266861 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2450 | gb_free 60.4 | wall 0 epoch 033 | loss 4 | nll_loss 2.359 | ppl 5.13 | wps 292801 | ups 0.68 | wpb 428942 | bsz 16332.8 | num_updates 56168 | lr 0.000266861 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2450 | gb_free 60.4 | wall 0 epoch 033 | loss 4 | nll_loss 2.359 | ppl 5.13 | wps 292801 | ups 0.68 | wpb 428942 | bsz 16332.8 | num_updates 56168 | lr 0.000266861 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2450 | gb_free 60.4 | wall 0 epoch 033 | loss 4 | nll_loss 2.359 | ppl 5.13 | wps 292801 | ups 0.68 | wpb 428942 | bsz 16332.8 | num_updates 56168 | lr 0.000266861 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2450 | gb_free 60.4 | wall 0 epoch 033 | loss 4 | nll_loss 2.359 | ppl 5.13 | wps 292801 | ups 0.68 | wpb 428942 | bsz 16332.8 | num_updates 56168 | lr 0.000266861 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2450 | gb_free 60.4 | wall 0 epoch 033 | loss 4 | nll_loss 2.359 | ppl 5.13 | wps 292801 | ups 0.68 | wpb 428942 | bsz 16332.8 | num_updates 56168 | lr 0.000266861 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2450 | gb_free 60.4 | wall 0 epoch 033 | loss 4 | nll_loss 2.359 | ppl 5.13 | wps 292801 | ups 0.68 | wpb 428942 | bsz 16332.8 | num_updates 56168 | lr 0.000266861 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2450 | gb_free 60.4 | wall 0 epoch 033 | loss 4 | nll_loss 2.359 | ppl 5.13 | wps 292801 | ups 0.68 | wpb 428942 | bsz 16332.8 | num_updates 56168 | lr 0.000266861 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2450 | gb_free 60.4 | wall 0 epoch 033 | loss 4 | nll_loss 2.359 | ppl 5.13 | wps 292801 | ups 0.68 | wpb 428942 | bsz 16332.8 | num_updates 56168 | lr 0.000266861 | gnorm 0.237 | clip 0 | loss_scale 1 | train_wall 2450 | gb_free 60.4 | wall 0 Start iterating over samples epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 32 / 1707 loss=3.999, nll_loss=2.358, ppl=5.13, wps=298172, ups=0.7, wpb=426213, bsz=16121.9, num_updates=56200, lr=0.000266785, gnorm=0.242, clip=0, loss_scale=1, train_wall=142, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 132 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=297278, ups=0.69, wpb=429385, bsz=16248.2, num_updates=56300, lr=0.000266548, gnorm=0.243, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 232 / 1707 loss=3.984, nll_loss=2.341, ppl=5.07, wps=297606, ups=0.69, wpb=429574, bsz=16311.8, num_updates=56400, lr=0.000266312, gnorm=0.235, clip=0, loss_scale=2, train_wall=143, gb_free=59.4, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 332 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=298820, ups=0.69, wpb=430619, bsz=16255.2, num_updates=56500, lr=0.000266076, gnorm=0.23, clip=0, loss_scale=2, train_wall=143, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 432 / 1707 loss=3.986, nll_loss=2.344, ppl=5.08, wps=296631, ups=0.69, wpb=428266, bsz=16290.3, num_updates=56600, lr=0.000265841, gnorm=0.249, clip=0, loss_scale=4, train_wall=144, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 533 / 1707 loss=3.99, nll_loss=2.347, ppl=5.09, wps=293949, ups=0.69, wpb=428681, bsz=16304.7, num_updates=56700, lr=0.000265606, gnorm=0.226, clip=0, loss_scale=2, train_wall=145, gb_free=59.2, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 634 / 1707 loss=3.993, nll_loss=2.352, ppl=5.1, wps=294170, ups=0.69, wpb=428870, bsz=16377, num_updates=56800, lr=0.000265372, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=58.7, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 734 / 1707 loss=3.992, nll_loss=2.35, ppl=5.1, wps=297273, ups=0.69, wpb=430055, bsz=15884.3, num_updates=56900, lr=0.000265139, gnorm=0.234, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 epoch 034: 834 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=296356, ups=0.69, wpb=429227, bsz=16506.2, num_updates=57000, lr=0.000264906, gnorm=0.247, clip=0, loss_scale=2, train_wall=144, gb_free=58.9, wall=0 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034 | valid on 'valid' subset | loss 4.287 | nll_loss 2.665 | ppl 6.34 | wps 75715.9 | wpb 21331 | bsz 1016 | num_updates 57000 | best_loss 4.279 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 934 / 1707 loss=3.999, nll_loss=2.359, ppl=5.13, wps=265191, ups=0.62, wpb=429227, bsz=16481.4, num_updates=57100, lr=0.000264674, gnorm=0.239, clip=0, loss_scale=2, train_wall=144, gb_free=59, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1035 / 1707 loss=3.996, nll_loss=2.355, ppl=5.12, wps=293154, ups=0.68, wpb=428410, bsz=16404.2, num_updates=57200, lr=0.000264443, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.2, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1135 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297525, ups=0.69, wpb=428459, bsz=16268.1, num_updates=57300, lr=0.000264212, gnorm=0.24, clip=0, loss_scale=1, train_wall=143, gb_free=58.8, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1235 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=296538, ups=0.69, wpb=428587, bsz=16408.9, num_updates=57400, lr=0.000263982, gnorm=0.256, clip=1, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1336 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=294244, ups=0.69, wpb=429363, bsz=16448.4, num_updates=57500, lr=0.000263752, gnorm=0.245, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1436 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297854, ups=0.69, wpb=429836, bsz=16272, num_updates=57600, lr=0.000263523, gnorm=0.241, clip=0, loss_scale=1, train_wall=143, gb_free=59.1, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1536 / 1707 loss=4.011, nll_loss=2.372, ppl=5.18, wps=297772, ups=0.69, wpb=429975, bsz=16553.6, num_updates=57700, lr=0.000263295, gnorm=0.245, clip=0, loss_scale=1, train_wall=143, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 epoch 034: 1637 / 1707 loss=4.017, nll_loss=2.38, ppl=5.2, wps=294393, ups=0.69, wpb=428728, bsz=16464, num_updates=57800, lr=0.000263067, gnorm=0.239, clip=0, loss_scale=1, train_wall=145, gb_free=59.3, wall=0 end of epoch 34 (average epoch stats below) epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 epoch 034 | loss 3.996 | nll_loss 2.356 | ppl 5.12 | wps 294044 | ups 0.69 | wpb 428942 | bsz 16334.2 | num_updates 57869 | lr 0.00026291 | gnorm 0.24 | clip 0.1 | loss_scale 0.5 | train_wall 2449 | gb_free 59.6 | wall 0 Start iterating over samples epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 31 / 1707 loss=4.003, nll_loss=2.363, ppl=5.15, wps=293357, ups=0.69, wpb=425998, bsz=16269.5, num_updates=57900, lr=0.00026284, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=144, gb_free=59, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 epoch 035: 131 / 1707 loss=3.97, nll_loss=2.325, ppl=5.01, wps=297566, ups=0.69, wpb=428759, bsz=16092, num_updates=58000, lr=0.000262613, gnorm=0.247, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.1, wall=0 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.288 | nll_loss 2.671 | ppl 6.37 | wps 76569.8 | wpb 21331 | bsz 1016 | num_updates 58000 | best_loss 4.279 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 231 / 1707 loss=3.978, nll_loss=2.335, ppl=5.04, wps=274571, ups=0.64, wpb=428766, bsz=16422.8, num_updates=58100, lr=0.000262387, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.6, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 332 / 1707 loss=3.98, nll_loss=2.337, ppl=5.05, wps=294866, ups=0.69, wpb=428961, bsz=16442.9, num_updates=58200, lr=0.000262161, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=145, gb_free=59.2, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 432 / 1707 loss=3.994, nll_loss=2.352, ppl=5.11, wps=297822, ups=0.69, wpb=428745, bsz=16463.9, num_updates=58300, lr=0.000261936, gnorm=0.239, clip=0, loss_scale=0.5, train_wall=143, gb_free=59.5, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 533 / 1707 loss=3.989, nll_loss=2.347, ppl=5.09, wps=294473, ups=0.69, wpb=428944, bsz=16188.2, num_updates=58400, lr=0.000261712, gnorm=0.25, clip=0, loss_scale=0.25, train_wall=145, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 633 / 1707 loss=3.988, nll_loss=2.346, ppl=5.08, wps=296975, ups=0.69, wpb=429741, bsz=16287, num_updates=58500, lr=0.000261488, gnorm=0.23, clip=0, loss_scale=0.25, train_wall=144, gb_free=58.8, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 733 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=297060, ups=0.69, wpb=429068, bsz=16187.4, num_updates=58600, lr=0.000261265, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.2, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 833 / 1707 loss=3.994, nll_loss=2.353, ppl=5.11, wps=295990, ups=0.69, wpb=428094, bsz=16340.9, num_updates=58700, lr=0.000261042, gnorm=0.242, clip=0, loss_scale=0.5, train_wall=144, gb_free=58.9, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 933 / 1707 loss=3.995, nll_loss=2.354, ppl=5.11, wps=297466, ups=0.69, wpb=429525, bsz=16245, num_updates=58800, lr=0.00026082, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=144, gb_free=59.5, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1033 / 1707 loss=3.997, nll_loss=2.356, ppl=5.12, wps=297631, ups=0.69, wpb=430089, bsz=16427.7, num_updates=58900, lr=0.000260599, gnorm=0.249, clip=0, loss_scale=1, train_wall=144, gb_free=59, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 epoch 035: 1133 / 1707 loss=4.003, nll_loss=2.363, ppl=5.14, wps=298533, ups=0.69, wpb=431175, bsz=16638.6, num_updates=59000, lr=0.000260378, gnorm=0.231, clip=0, loss_scale=1, train_wall=144, gb_free=59.4, wall=0 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035 | valid on 'valid' subset | loss 4.297 | nll_loss 2.679 | ppl 6.4 | wps 76435.9 | wpb 21331 | bsz 1016 | num_updates 59000 | best_loss 4.279 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1233 / 1707 loss=3.997, nll_loss=2.357, ppl=5.12, wps=273785, ups=0.64, wpb=428400, bsz=16255.3, num_updates=59100, lr=0.000260157, gnorm=0.235, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1333 / 1707 loss=3.998, nll_loss=2.358, ppl=5.13, wps=297157, ups=0.69, wpb=427645, bsz=16285.3, num_updates=59200, lr=0.000259938, gnorm=0.257, clip=0, loss_scale=2, train_wall=143, gb_free=59.3, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1434 / 1707 loss=4.002, nll_loss=2.362, ppl=5.14, wps=295984, ups=0.69, wpb=430380, bsz=16221.1, num_updates=59300, lr=0.000259718, gnorm=0.249, clip=0, loss_scale=1, train_wall=145, gb_free=59, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1534 / 1707 loss=4, nll_loss=2.36, ppl=5.13, wps=297384, ups=0.69, wpb=428395, bsz=16080, num_updates=59400, lr=0.0002595, gnorm=0.238, clip=0, loss_scale=1, train_wall=143, gb_free=58.9, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 epoch 035: 1634 / 1707 loss=4.003, nll_loss=2.364, ppl=5.15, wps=297046, ups=0.69, wpb=428975, bsz=16675.3, num_updates=59500, lr=0.000259281, gnorm=0.252, clip=1, loss_scale=2, train_wall=144, gb_free=59.2, wall=0 end of epoch 35 (average epoch stats below) epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 epoch 035 | loss 3.993 | nll_loss 2.352 | ppl 5.1 | wps 293768 | ups 0.68 | wpb 428944 | bsz 16332.1 | num_updates 59572 | lr 0.000259125 | gnorm 0.241 | clip 0.1 | loss_scale 1 | train_wall 2447 | gb_free 59.8 | wall 0 Start iterating over samples epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 28 / 1707 loss=3.992, nll_loss=2.351, ppl=5.1, wps=292971, ups=0.69, wpb=426557, bsz=16361.7, num_updates=59600, lr=0.000259064, gnorm=0.258, clip=0, loss_scale=1, train_wall=144, gb_free=60, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 128 / 1707 loss=3.976, nll_loss=2.331, ppl=5.03, wps=298412, ups=0.69, wpb=429482, bsz=16363.6, num_updates=59700, lr=0.000258847, gnorm=0.27, clip=0, loss_scale=1, train_wall=143, gb_free=59.5, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 228 / 1707 loss=3.98, nll_loss=2.336, ppl=5.05, wps=295994, ups=0.69, wpb=427861, bsz=16377, num_updates=59800, lr=0.00025863, gnorm=0.254, clip=0, loss_scale=1, train_wall=144, gb_free=59.3, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 329 / 1707 loss=3.979, nll_loss=2.336, ppl=5.05, wps=294777, ups=0.68, wpb=430508, bsz=16075.5, num_updates=59900, lr=0.000258414, gnorm=0.237, clip=0, loss_scale=1, train_wall=145, gb_free=59.4, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 epoch 036: 429 / 1707 loss=3.991, nll_loss=2.349, ppl=5.09, wps=295840, ups=0.69, wpb=429034, bsz=16487, num_updates=60000, lr=0.000258199, gnorm=0.28, clip=0, loss_scale=1, train_wall=144, gb_free=59.7, wall=0 Stopping training due to num_updates: 60000 >= max_update: 60000 begin validation on "valid" subset epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 epoch 036 | valid on 'valid' subset | loss 4.286 | nll_loss 2.666 | ppl 6.35 | wps 76443.4 | wpb 21331 | bsz 1016 | num_updates 60000 | best_loss 4.279 end of epoch 36 (average epoch stats below) epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 epoch 036 | loss 3.981 | nll_loss 2.337 | ppl 5.05 | wps 288516 | ups 0.67 | wpb 429424 | bsz 16343.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.263 | clip 0 | loss_scale 1 | train_wall 617 | gb_free 59.7 | wall 0 done training in 32167.4 seconds