{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/ja-en.do01.ado01/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:24563', 'distributed_port': 24563, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 16384, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 16384, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [4], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/ja-en.do01.ado01', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': True, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/ja-en.do01.ado01/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=16384, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=16384, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[4], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/ja-en.do01.ado01', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=True, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/ja-en/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, dropout=0.1, attention_dropout=0.1, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_layers=6, encoder_learned_pos=False, decoder_embed_path=None, decoder_layers=6, decoder_normalize_before=False, decoder_learned_pos=False, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/ja-en/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/ja-en.do01.ado01/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:51810', 'distributed_port': 51810, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 16384, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 16384, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [4], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/ja-en.do01.ado01', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/ja-en.do01.ado01/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=16384, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=16384, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[4], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/ja-en.do01.ado01', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=False, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/ja-en/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, dropout=0.1, attention_dropout=0.1, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_layers=6, encoder_learned_pos=False, decoder_embed_path=None, decoder_layers=6, decoder_normalize_before=False, decoder_learned_pos=False, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/ja-en/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} TransformerModel( (encoder): TransformerEncoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(32000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerEncoderLayerBase( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (dropout_module): FairseqDropout() (activation_dropout_module): FairseqDropout() (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) ) (decoder): TransformerDecoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(16000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerDecoderLayerBase( (dropout_module): FairseqDropout() (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (activation_dropout_module): FairseqDropout() (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (encoder_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (encoder_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) (output_projection): Linear(in_features=1024, out_features=16000, bias=False) ) ) task: TranslationTask model: TransformerModel criterion: LabelSmoothedCrossEntropyCriterion num. shared model params: 326,221,824 (num. trained: 326,221,824) num. expert model params: 0 (num. trained: 0) training on 8 devices (GPUs/TPUs) max tokens per device = 16384 and max sentences per device = None begin dry-run validation on "valid" subset Start iterating over samples epoch 001: 101 / 1689 loss=12.08, nll_loss=11.766, ppl=3481.91, wps=585154, ups=1.18, wpb=495063, bsz=16556.9, num_updates=100, lr=2.5e-05, gnorm=2.749, clip=83, loss_scale=4, train_wall=88, gb_free=60.8, wall=103 epoch 001: 201 / 1689 loss=10.471, nll_loss=9.922, ppl=969.98, wps=580238, ups=1.17, wpb=494772, bsz=16958.6, num_updates=200, lr=5e-05, gnorm=1.856, clip=99, loss_scale=4, train_wall=84, gb_free=60.8, wall=189 epoch 001: 301 / 1689 loss=9.759, nll_loss=9.078, ppl=540.36, wps=582218, ups=1.17, wpb=496328, bsz=16644.9, num_updates=300, lr=7.5e-05, gnorm=2.195, clip=100, loss_scale=4, train_wall=84, gb_free=61.4, wall=274 epoch 001: 401 / 1689 loss=9.146, nll_loss=8.354, ppl=327.19, wps=579081, ups=1.17, wpb=495021, bsz=16565.7, num_updates=400, lr=0.0001, gnorm=1.946, clip=100, loss_scale=4, train_wall=85, gb_free=61.5, wall=359 epoch 001: 501 / 1689 loss=8.667, nll_loss=7.788, ppl=221.07, wps=582463, ups=1.18, wpb=495038, bsz=16610.6, num_updates=500, lr=0.000125, gnorm=1.843, clip=100, loss_scale=4, train_wall=84, gb_free=61.2, wall=444 epoch 001: 602 / 1689 loss=8.271, nll_loss=7.323, ppl=160.12, wps=574684, ups=1.16, wpb=495616, bsz=16517.3, num_updates=600, lr=0.00015, gnorm=1.652, clip=99, loss_scale=4, train_wall=85, gb_free=61.7, wall=531 epoch 001: 702 / 1689 loss=7.863, nll_loss=6.847, ppl=115.12, wps=579620, ups=1.17, wpb=494766, bsz=16314.2, num_updates=700, lr=0.000175, gnorm=1.555, clip=100, loss_scale=4, train_wall=84, gb_free=61.3, wall=616 epoch 001: 802 / 1689 loss=7.465, nll_loss=6.386, ppl=83.64, wps=580132, ups=1.17, wpb=496202, bsz=16400.6, num_updates=800, lr=0.0002, gnorm=1.374, clip=99, loss_scale=4, train_wall=84, gb_free=60.9, wall=702 epoch 001: 902 / 1689 loss=7.094, nll_loss=5.957, ppl=62.11, wps=581363, ups=1.17, wpb=496562, bsz=16617.4, num_updates=900, lr=0.000225, gnorm=1.262, clip=97, loss_scale=4, train_wall=84, gb_free=61.8, wall=787 epoch 001: 1002 / 1689 loss=6.764, nll_loss=5.575, ppl=47.67, wps=581605, ups=1.17, wpb=496189, bsz=16704.8, num_updates=1000, lr=0.00025, gnorm=1.176, clip=88, loss_scale=4, train_wall=84, gb_free=61.1, wall=872 begin validation on "valid" subset epoch 001 | valid on 'valid' subset | loss 6.614 | nll_loss 5.364 | ppl 41.19 | wps 0 | wpb 44526 | bsz 2008 | num_updates 1000 epoch 001: 1102 / 1689 loss=6.437, nll_loss=5.2, ppl=36.77, wps=511307, ups=1.03, wpb=494438, bsz=16503.4, num_updates=1100, lr=0.000275, gnorm=1.122, clip=78, loss_scale=8, train_wall=84, gb_free=59.3, wall=969 epoch 001: 1202 / 1689 loss=6.126, nll_loss=4.843, ppl=28.7, wps=580772, ups=1.17, wpb=494853, bsz=16237.2, num_updates=1200, lr=0.0003, gnorm=1.028, clip=52, loss_scale=8, train_wall=84, gb_free=61.6, wall=1054 epoch 001: 1302 / 1689 loss=5.827, nll_loss=4.502, ppl=22.66, wps=579474, ups=1.17, wpb=493588, bsz=16373.8, num_updates=1300, lr=0.000325, gnorm=0.952, clip=35, loss_scale=8, train_wall=84, gb_free=60.8, wall=1139 epoch 001: 1402 / 1689 loss=5.576, nll_loss=4.217, ppl=18.6, wps=586959, ups=1.18, wpb=497384, bsz=16279.9, num_updates=1400, lr=0.00035, gnorm=0.853, clip=21, loss_scale=8, train_wall=84, gb_free=61.9, wall=1224 epoch 001: 1502 / 1689 loss=5.373, nll_loss=3.989, ppl=15.88, wps=580746, ups=1.17, wpb=494548, bsz=16575.5, num_updates=1500, lr=0.000375, gnorm=0.785, clip=9, loss_scale=8, train_wall=84, gb_free=61.6, wall=1309 epoch 001: 1603 / 1689 loss=5.199, nll_loss=3.795, ppl=13.88, wps=576540, ups=1.16, wpb=495591, bsz=16475.7, num_updates=1600, lr=0.0004, gnorm=0.669, clip=5, loss_scale=8, train_wall=85, gb_free=60.9, wall=1395 end of epoch 1 (average epoch stats below) epoch 001 | loss 7.504 | nll_loss 6.449 | ppl 87.34 | wps 576039 | ups 1.16 | wpb 495110 | bsz 16504.5 | num_updates 1686 | lr 0.0004215 | gnorm 1.399 | clip 69.3 | loss_scale 8 | train_wall 1423 | gb_free 60.7 | wall 1468 Start iterating over samples epoch 002: 14 / 1689 loss=5.083, nll_loss=3.666, ppl=12.7, wps=575698, ups=1.17, wpb=490777, bsz=16322.4, num_updates=1700, lr=0.000425, gnorm=0.659, clip=3, loss_scale=8, train_wall=84, gb_free=61.4, wall=1480 epoch 002: 14 / 1689 loss=5.083, nll_loss=3.666, ppl=12.7, wps=575698, ups=1.17, wpb=490777, bsz=16322.4, num_updates=1700, lr=0.000425, gnorm=0.659, clip=3, loss_scale=8, train_wall=84, gb_free=61.4, wall=1480 epoch 002: 114 / 1689 loss=4.958, nll_loss=3.527, ppl=11.53, wps=581118, ups=1.17, wpb=496688, bsz=16760.9, num_updates=1800, lr=0.00045, gnorm=0.569, clip=2, loss_scale=8, train_wall=84, gb_free=61.3, wall=1566 epoch 002: 114 / 1689 loss=4.958, nll_loss=3.527, ppl=11.53, wps=581118, ups=1.17, wpb=496688, bsz=16760.9, num_updates=1800, lr=0.00045, gnorm=0.569, clip=2, loss_scale=8, train_wall=84, gb_free=61.3, wall=1566 epoch 002: 214 / 1689 loss=4.875, nll_loss=3.436, ppl=10.82, wps=577631, ups=1.17, wpb=494526, bsz=16524.7, num_updates=1900, lr=0.000475, gnorm=0.574, clip=0, loss_scale=8, train_wall=85, gb_free=61.5, wall=1652 epoch 002: 214 / 1689 loss=4.875, nll_loss=3.436, ppl=10.82, wps=577631, ups=1.17, wpb=494526, bsz=16524.7, num_updates=1900, lr=0.000475, gnorm=0.574, clip=0, loss_scale=8, train_wall=85, gb_free=61.5, wall=1652 epoch 002: 315 / 1689 loss=4.782, nll_loss=3.335, ppl=10.09, wps=572788, ups=1.16, wpb=494348, bsz=16757.8, num_updates=2000, lr=0.0005, gnorm=0.487, clip=1, loss_scale=4, train_wall=85, gb_free=61.4, wall=1738 epoch 002: 315 / 1689 loss=4.782, nll_loss=3.335, ppl=10.09, wps=572788, ups=1.16, wpb=494348, bsz=16757.8, num_updates=2000, lr=0.0005, gnorm=0.487, clip=1, loss_scale=4, train_wall=85, gb_free=61.4, wall=1738 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 4.784 | nll_loss 3.287 | ppl 9.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 2000 | best_loss 4.784 epoch 002 | valid on 'valid' subset | loss 4.784 | nll_loss 3.287 | ppl 9.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 2000 | best_loss 4.784 epoch 002: 415 / 1689 loss=4.722, nll_loss=3.269, ppl=9.64, wps=516084, ups=1.04, wpb=495100, bsz=16294.4, num_updates=2100, lr=0.000525, gnorm=0.512, clip=2, loss_scale=4, train_wall=84, gb_free=61.2, wall=1834 epoch 002: 415 / 1689 loss=4.722, nll_loss=3.269, ppl=9.64, wps=516084, ups=1.04, wpb=495100, bsz=16294.4, num_updates=2100, lr=0.000525, gnorm=0.512, clip=2, loss_scale=4, train_wall=84, gb_free=61.2, wall=1834 epoch 002: 515 / 1689 loss=4.686, nll_loss=3.23, ppl=9.38, wps=584357, ups=1.18, wpb=494927, bsz=16567.2, num_updates=2200, lr=0.00055, gnorm=0.572, clip=6, loss_scale=4, train_wall=83, gb_free=61.6, wall=1918 epoch 002: 515 / 1689 loss=4.686, nll_loss=3.23, ppl=9.38, wps=584357, ups=1.18, wpb=494927, bsz=16567.2, num_updates=2200, lr=0.00055, gnorm=0.572, clip=6, loss_scale=4, train_wall=83, gb_free=61.6, wall=1918 epoch 002: 615 / 1689 loss=4.582, nll_loss=3.116, ppl=8.67, wps=581703, ups=1.17, wpb=495521, bsz=16630.4, num_updates=2300, lr=0.000575, gnorm=0.389, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=2004 epoch 002: 615 / 1689 loss=4.582, nll_loss=3.116, ppl=8.67, wps=581703, ups=1.17, wpb=495521, bsz=16630.4, num_updates=2300, lr=0.000575, gnorm=0.389, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=2004 epoch 002: 715 / 1689 loss=4.559, nll_loss=3.091, ppl=8.52, wps=582822, ups=1.18, wpb=495264, bsz=16162, num_updates=2400, lr=0.0006, gnorm=0.47, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=2089 epoch 002: 715 / 1689 loss=4.559, nll_loss=3.091, ppl=8.52, wps=582822, ups=1.18, wpb=495264, bsz=16162, num_updates=2400, lr=0.0006, gnorm=0.47, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=2089 epoch 002: 816 / 1689 loss=4.531, nll_loss=3.062, ppl=8.35, wps=577342, ups=1.17, wpb=494630, bsz=16780.7, num_updates=2500, lr=0.000625, gnorm=0.493, clip=9, loss_scale=4, train_wall=84, gb_free=60.4, wall=2174 epoch 002: 816 / 1689 loss=4.531, nll_loss=3.062, ppl=8.35, wps=577342, ups=1.17, wpb=494630, bsz=16780.7, num_updates=2500, lr=0.000625, gnorm=0.493, clip=9, loss_scale=4, train_wall=84, gb_free=60.4, wall=2174 epoch 002: 916 / 1689 loss=4.433, nll_loss=2.955, ppl=7.75, wps=584709, ups=1.18, wpb=495471, bsz=16597.7, num_updates=2600, lr=0.00065, gnorm=0.358, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=2259 epoch 002: 916 / 1689 loss=4.433, nll_loss=2.955, ppl=7.75, wps=584709, ups=1.18, wpb=495471, bsz=16597.7, num_updates=2600, lr=0.00065, gnorm=0.358, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=2259 epoch 002: 1016 / 1689 loss=4.406, nll_loss=2.926, ppl=7.6, wps=584890, ups=1.18, wpb=495174, bsz=16381.9, num_updates=2700, lr=0.000675, gnorm=0.403, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=2344 epoch 002: 1016 / 1689 loss=4.406, nll_loss=2.926, ppl=7.6, wps=584890, ups=1.18, wpb=495174, bsz=16381.9, num_updates=2700, lr=0.000675, gnorm=0.403, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=2344 epoch 002: 1116 / 1689 loss=4.383, nll_loss=2.9, ppl=7.46, wps=585023, ups=1.18, wpb=494962, bsz=16373.1, num_updates=2800, lr=0.0007, gnorm=0.396, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=2428 epoch 002: 1116 / 1689 loss=4.383, nll_loss=2.9, ppl=7.46, wps=585023, ups=1.18, wpb=494962, bsz=16373.1, num_updates=2800, lr=0.0007, gnorm=0.396, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=2428 epoch 002: 1216 / 1689 loss=4.339, nll_loss=2.854, ppl=7.23, wps=583742, ups=1.18, wpb=496100, bsz=16472.5, num_updates=2900, lr=0.000725, gnorm=0.384, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=2513 epoch 002: 1216 / 1689 loss=4.339, nll_loss=2.854, ppl=7.23, wps=583742, ups=1.18, wpb=496100, bsz=16472.5, num_updates=2900, lr=0.000725, gnorm=0.384, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=2513 epoch 002: 1316 / 1689 loss=4.307, nll_loss=2.819, ppl=7.06, wps=587200, ups=1.18, wpb=496192, bsz=16642.6, num_updates=3000, lr=0.00075, gnorm=0.36, clip=0, loss_scale=8, train_wall=83, gb_free=61.6, wall=2598 epoch 002: 1316 / 1689 loss=4.307, nll_loss=2.819, ppl=7.06, wps=587200, ups=1.18, wpb=496192, bsz=16642.6, num_updates=3000, lr=0.00075, gnorm=0.36, clip=0, loss_scale=8, train_wall=83, gb_free=61.6, wall=2598 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 4.303 | nll_loss 2.761 | ppl 6.78 | wps 0 | wpb 44526 | bsz 2008 | num_updates 3000 | best_loss 4.303 epoch 002 | valid on 'valid' subset | loss 4.303 | nll_loss 2.761 | ppl 6.78 | wps 0 | wpb 44526 | bsz 2008 | num_updates 3000 | best_loss 4.303 epoch 002: 1416 / 1689 loss=4.276, nll_loss=2.785, ppl=6.89, wps=520940, ups=1.05, wpb=494703, bsz=16277.2, num_updates=3100, lr=0.000775, gnorm=0.361, clip=0, loss_scale=8, train_wall=83, gb_free=61.4, wall=2693 epoch 002: 1416 / 1689 loss=4.276, nll_loss=2.785, ppl=6.89, wps=520940, ups=1.05, wpb=494703, bsz=16277.2, num_updates=3100, lr=0.000775, gnorm=0.361, clip=0, loss_scale=8, train_wall=83, gb_free=61.4, wall=2693 epoch 002: 1517 / 1689 loss=4.253, nll_loss=2.76, ppl=6.78, wps=581590, ups=1.17, wpb=495600, bsz=16398.9, num_updates=3200, lr=0.0008, gnorm=0.36, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=2778 epoch 002: 1517 / 1689 loss=4.253, nll_loss=2.76, ppl=6.78, wps=581590, ups=1.17, wpb=495600, bsz=16398.9, num_updates=3200, lr=0.0008, gnorm=0.36, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=2778 epoch 002: 1617 / 1689 loss=4.234, nll_loss=2.741, ppl=6.68, wps=589528, ups=1.19, wpb=497280, bsz=16313, num_updates=3300, lr=0.000825, gnorm=0.363, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=2862 epoch 002: 1617 / 1689 loss=4.234, nll_loss=2.741, ppl=6.68, wps=589528, ups=1.19, wpb=497280, bsz=16313, num_updates=3300, lr=0.000825, gnorm=0.363, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=2862 end of epoch 2 (average epoch stats below) epoch 002 | loss 4.512 | nll_loss 3.041 | ppl 8.23 | wps 573909 | ups 1.16 | wpb 495132 | bsz 16498.6 | num_updates 3372 | lr 0.000843 | gnorm 0.439 | clip 1.2 | loss_scale 4 | train_wall 1413 | gb_free 65.3 | wall 2922 epoch 002 | loss 4.512 | nll_loss 3.041 | ppl 8.23 | wps 573909 | ups 1.16 | wpb 495132 | bsz 16498.6 | num_updates 3372 | lr 0.000843 | gnorm 0.439 | clip 1.2 | loss_scale 4 | train_wall 1413 | gb_free 65.3 | wall 2922 Start iterating over samples epoch 003: 28 / 1689 loss=4.214, nll_loss=2.719, ppl=6.59, wps=583025, ups=1.19, wpb=490460, bsz=16511.3, num_updates=3400, lr=0.00085, gnorm=0.357, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=2946 epoch 003: 28 / 1689 loss=4.214, nll_loss=2.719, ppl=6.59, wps=583025, ups=1.19, wpb=490460, bsz=16511.3, num_updates=3400, lr=0.00085, gnorm=0.357, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=2946 epoch 003: 28 / 1689 loss=4.214, nll_loss=2.719, ppl=6.59, wps=583025, ups=1.19, wpb=490460, bsz=16511.3, num_updates=3400, lr=0.00085, gnorm=0.357, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=2946 epoch 003: 128 / 1689 loss=4.178, nll_loss=2.679, ppl=6.4, wps=584699, ups=1.18, wpb=495133, bsz=16849.4, num_updates=3500, lr=0.000875, gnorm=0.338, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=3031 epoch 003: 128 / 1689 loss=4.178, nll_loss=2.679, ppl=6.4, wps=584699, ups=1.18, wpb=495133, bsz=16849.4, num_updates=3500, lr=0.000875, gnorm=0.338, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=3031 epoch 003: 128 / 1689 loss=4.178, nll_loss=2.679, ppl=6.4, wps=584699, ups=1.18, wpb=495133, bsz=16849.4, num_updates=3500, lr=0.000875, gnorm=0.338, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=3031 epoch 003: 228 / 1689 loss=4.176, nll_loss=2.678, ppl=6.4, wps=583522, ups=1.18, wpb=495438, bsz=16848.2, num_updates=3600, lr=0.0009, gnorm=0.351, clip=0, loss_scale=4, train_wall=84, gb_free=62.5, wall=3116 epoch 003: 228 / 1689 loss=4.176, nll_loss=2.678, ppl=6.4, wps=583522, ups=1.18, wpb=495438, bsz=16848.2, num_updates=3600, lr=0.0009, gnorm=0.351, clip=0, loss_scale=4, train_wall=84, gb_free=62.5, wall=3116 epoch 003: 228 / 1689 loss=4.176, nll_loss=2.678, ppl=6.4, wps=583522, ups=1.18, wpb=495438, bsz=16848.2, num_updates=3600, lr=0.0009, gnorm=0.351, clip=0, loss_scale=4, train_wall=84, gb_free=62.5, wall=3116 epoch 003: 328 / 1689 loss=4.161, nll_loss=2.663, ppl=6.33, wps=584594, ups=1.18, wpb=494814, bsz=16504.7, num_updates=3700, lr=0.000925, gnorm=0.348, clip=0, loss_scale=8, train_wall=83, gb_free=61.1, wall=3201 epoch 003: 328 / 1689 loss=4.161, nll_loss=2.663, ppl=6.33, wps=584594, ups=1.18, wpb=494814, bsz=16504.7, num_updates=3700, lr=0.000925, gnorm=0.348, clip=0, loss_scale=8, train_wall=83, gb_free=61.1, wall=3201 epoch 003: 328 / 1689 loss=4.161, nll_loss=2.663, ppl=6.33, wps=584594, ups=1.18, wpb=494814, bsz=16504.7, num_updates=3700, lr=0.000925, gnorm=0.348, clip=0, loss_scale=8, train_wall=83, gb_free=61.1, wall=3201 epoch 003: 429 / 1689 loss=4.154, nll_loss=2.655, ppl=6.3, wps=578958, ups=1.17, wpb=494717, bsz=16453.2, num_updates=3800, lr=0.00095, gnorm=0.341, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=3286 epoch 003: 429 / 1689 loss=4.154, nll_loss=2.655, ppl=6.3, wps=578958, ups=1.17, wpb=494717, bsz=16453.2, num_updates=3800, lr=0.00095, gnorm=0.341, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=3286 epoch 003: 429 / 1689 loss=4.154, nll_loss=2.655, ppl=6.3, wps=578958, ups=1.17, wpb=494717, bsz=16453.2, num_updates=3800, lr=0.00095, gnorm=0.341, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=3286 epoch 003: 529 / 1689 loss=4.146, nll_loss=2.646, ppl=6.26, wps=586817, ups=1.18, wpb=496084, bsz=16501, num_updates=3900, lr=0.000975, gnorm=0.374, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=3371 epoch 003: 529 / 1689 loss=4.146, nll_loss=2.646, ppl=6.26, wps=586817, ups=1.18, wpb=496084, bsz=16501, num_updates=3900, lr=0.000975, gnorm=0.374, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=3371 epoch 003: 529 / 1689 loss=4.146, nll_loss=2.646, ppl=6.26, wps=586817, ups=1.18, wpb=496084, bsz=16501, num_updates=3900, lr=0.000975, gnorm=0.374, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=3371 epoch 003: 629 / 1689 loss=4.123, nll_loss=2.621, ppl=6.15, wps=588479, ups=1.19, wpb=495658, bsz=16451.8, num_updates=4000, lr=0.001, gnorm=0.322, clip=0, loss_scale=4, train_wall=83, gb_free=60.6, wall=3455 epoch 003: 629 / 1689 loss=4.123, nll_loss=2.621, ppl=6.15, wps=588479, ups=1.19, wpb=495658, bsz=16451.8, num_updates=4000, lr=0.001, gnorm=0.322, clip=0, loss_scale=4, train_wall=83, gb_free=60.6, wall=3455 epoch 003: 629 / 1689 loss=4.123, nll_loss=2.621, ppl=6.15, wps=588479, ups=1.19, wpb=495658, bsz=16451.8, num_updates=4000, lr=0.001, gnorm=0.322, clip=0, loss_scale=4, train_wall=83, gb_free=60.6, wall=3455 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.167 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.167 epoch 003 | valid on 'valid' subset | loss 4.167 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.167 epoch 003 | valid on 'valid' subset | loss 4.167 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.167 epoch 003: 729 / 1689 loss=4.119, nll_loss=2.617, ppl=6.14, wps=517056, ups=1.04, wpb=496213, bsz=16341, num_updates=4100, lr=0.00098773, gnorm=0.325, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=3551 epoch 003: 729 / 1689 loss=4.119, nll_loss=2.617, ppl=6.14, wps=517056, ups=1.04, wpb=496213, bsz=16341, num_updates=4100, lr=0.00098773, gnorm=0.325, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=3551 epoch 003: 729 / 1689 loss=4.119, nll_loss=2.617, ppl=6.14, wps=517056, ups=1.04, wpb=496213, bsz=16341, num_updates=4100, lr=0.00098773, gnorm=0.325, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=3551 epoch 003: 829 / 1689 loss=4.108, nll_loss=2.606, ppl=6.09, wps=584987, ups=1.19, wpb=493044, bsz=16311.8, num_updates=4200, lr=0.0009759, gnorm=0.319, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=3635 epoch 003: 829 / 1689 loss=4.108, nll_loss=2.606, ppl=6.09, wps=584987, ups=1.19, wpb=493044, bsz=16311.8, num_updates=4200, lr=0.0009759, gnorm=0.319, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=3635 epoch 003: 829 / 1689 loss=4.108, nll_loss=2.606, ppl=6.09, wps=584987, ups=1.19, wpb=493044, bsz=16311.8, num_updates=4200, lr=0.0009759, gnorm=0.319, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=3635 epoch 003: 929 / 1689 loss=4.092, nll_loss=2.589, ppl=6.01, wps=588028, ups=1.19, wpb=494194, bsz=16361.3, num_updates=4300, lr=0.000964486, gnorm=0.308, clip=0, loss_scale=8, train_wall=83, gb_free=61.2, wall=3719 epoch 003: 929 / 1689 loss=4.092, nll_loss=2.589, ppl=6.01, wps=588028, ups=1.19, wpb=494194, bsz=16361.3, num_updates=4300, lr=0.000964486, gnorm=0.308, clip=0, loss_scale=8, train_wall=83, gb_free=61.2, wall=3719 epoch 003: 929 / 1689 loss=4.092, nll_loss=2.589, ppl=6.01, wps=588028, ups=1.19, wpb=494194, bsz=16361.3, num_updates=4300, lr=0.000964486, gnorm=0.308, clip=0, loss_scale=8, train_wall=83, gb_free=61.2, wall=3719 epoch 003: 1029 / 1689 loss=4.084, nll_loss=2.58, ppl=5.98, wps=588818, ups=1.19, wpb=496554, bsz=16440.3, num_updates=4400, lr=0.000953463, gnorm=0.307, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=3804 epoch 003: 1029 / 1689 loss=4.084, nll_loss=2.58, ppl=5.98, wps=588818, ups=1.19, wpb=496554, bsz=16440.3, num_updates=4400, lr=0.000953463, gnorm=0.307, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=3804 epoch 003: 1029 / 1689 loss=4.084, nll_loss=2.58, ppl=5.98, wps=588818, ups=1.19, wpb=496554, bsz=16440.3, num_updates=4400, lr=0.000953463, gnorm=0.307, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=3804 epoch 003: 1130 / 1689 loss=4.062, nll_loss=2.557, ppl=5.88, wps=580254, ups=1.17, wpb=496237, bsz=16901.9, num_updates=4500, lr=0.000942809, gnorm=0.301, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=3889 epoch 003: 1130 / 1689 loss=4.062, nll_loss=2.557, ppl=5.88, wps=580254, ups=1.17, wpb=496237, bsz=16901.9, num_updates=4500, lr=0.000942809, gnorm=0.301, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=3889 epoch 003: 1130 / 1689 loss=4.062, nll_loss=2.557, ppl=5.88, wps=580254, ups=1.17, wpb=496237, bsz=16901.9, num_updates=4500, lr=0.000942809, gnorm=0.301, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=3889 epoch 003: 1230 / 1689 loss=4.057, nll_loss=2.551, ppl=5.86, wps=585743, ups=1.19, wpb=494071, bsz=16463.8, num_updates=4600, lr=0.000932505, gnorm=0.295, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=3973 epoch 003: 1230 / 1689 loss=4.057, nll_loss=2.551, ppl=5.86, wps=585743, ups=1.19, wpb=494071, bsz=16463.8, num_updates=4600, lr=0.000932505, gnorm=0.295, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=3973 epoch 003: 1230 / 1689 loss=4.057, nll_loss=2.551, ppl=5.86, wps=585743, ups=1.19, wpb=494071, bsz=16463.8, num_updates=4600, lr=0.000932505, gnorm=0.295, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=3973 epoch 003: 1330 / 1689 loss=4.045, nll_loss=2.539, ppl=5.81, wps=589285, ups=1.19, wpb=495849, bsz=16334.4, num_updates=4700, lr=0.000922531, gnorm=0.286, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=4058 epoch 003: 1330 / 1689 loss=4.045, nll_loss=2.539, ppl=5.81, wps=589285, ups=1.19, wpb=495849, bsz=16334.4, num_updates=4700, lr=0.000922531, gnorm=0.286, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=4058 epoch 003: 1330 / 1689 loss=4.045, nll_loss=2.539, ppl=5.81, wps=589285, ups=1.19, wpb=495849, bsz=16334.4, num_updates=4700, lr=0.000922531, gnorm=0.286, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=4058 epoch 003: 1430 / 1689 loss=4.031, nll_loss=2.524, ppl=5.75, wps=585518, ups=1.18, wpb=496335, bsz=16704.6, num_updates=4800, lr=0.000912871, gnorm=0.278, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=4142 epoch 003: 1430 / 1689 loss=4.031, nll_loss=2.524, ppl=5.75, wps=585518, ups=1.18, wpb=496335, bsz=16704.6, num_updates=4800, lr=0.000912871, gnorm=0.278, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=4142 epoch 003: 1430 / 1689 loss=4.031, nll_loss=2.524, ppl=5.75, wps=585518, ups=1.18, wpb=496335, bsz=16704.6, num_updates=4800, lr=0.000912871, gnorm=0.278, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=4142 epoch 003: 1530 / 1689 loss=4.019, nll_loss=2.511, ppl=5.7, wps=588568, ups=1.19, wpb=495935, bsz=16441.5, num_updates=4900, lr=0.000903508, gnorm=0.278, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=4227 epoch 003: 1530 / 1689 loss=4.019, nll_loss=2.511, ppl=5.7, wps=588568, ups=1.19, wpb=495935, bsz=16441.5, num_updates=4900, lr=0.000903508, gnorm=0.278, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=4227 epoch 003: 1530 / 1689 loss=4.019, nll_loss=2.511, ppl=5.7, wps=588568, ups=1.19, wpb=495935, bsz=16441.5, num_updates=4900, lr=0.000903508, gnorm=0.278, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=4227 epoch 003: 1630 / 1689 loss=4.015, nll_loss=2.507, ppl=5.68, wps=585986, ups=1.18, wpb=495488, bsz=16111.9, num_updates=5000, lr=0.000894427, gnorm=0.274, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=4311 epoch 003: 1630 / 1689 loss=4.015, nll_loss=2.507, ppl=5.68, wps=585986, ups=1.18, wpb=495488, bsz=16111.9, num_updates=5000, lr=0.000894427, gnorm=0.274, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=4311 epoch 003: 1630 / 1689 loss=4.015, nll_loss=2.507, ppl=5.68, wps=585986, ups=1.18, wpb=495488, bsz=16111.9, num_updates=5000, lr=0.000894427, gnorm=0.274, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=4311 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.013 | nll_loss 2.467 | ppl 5.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.013 epoch 003 | valid on 'valid' subset | loss 4.013 | nll_loss 2.467 | ppl 5.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.013 epoch 003 | valid on 'valid' subset | loss 4.013 | nll_loss 2.467 | ppl 5.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.013 end of epoch 3 (average epoch stats below) epoch 003 | loss 4.097 | nll_loss 2.594 | ppl 6.04 | wps 576435 | ups 1.16 | wpb 495123 | bsz 16504.7 | num_updates 5059 | lr 0.000889196 | gnorm 0.315 | clip 0 | loss_scale 8 | train_wall 1406 | gb_free 63 | wall 4371 epoch 003 | loss 4.097 | nll_loss 2.594 | ppl 6.04 | wps 576435 | ups 1.16 | wpb 495123 | bsz 16504.7 | num_updates 5059 | lr 0.000889196 | gnorm 0.315 | clip 0 | loss_scale 8 | train_wall 1406 | gb_free 63 | wall 4371 epoch 003 | loss 4.097 | nll_loss 2.594 | ppl 6.04 | wps 576435 | ups 1.16 | wpb 495123 | bsz 16504.7 | num_updates 5059 | lr 0.000889196 | gnorm 0.315 | clip 0 | loss_scale 8 | train_wall 1406 | gb_free 63 | wall 4371 Start iterating over samples epoch 004: 41 / 1689 loss=3.991, nll_loss=2.48, ppl=5.58, wps=513727, ups=1.04, wpb=491687, bsz=16828.6, num_updates=5100, lr=0.000885615, gnorm=0.273, clip=0, loss_scale=8, train_wall=83, gb_free=61.5, wall=4407 epoch 004: 41 / 1689 loss=3.991, nll_loss=2.48, ppl=5.58, wps=513727, ups=1.04, wpb=491687, bsz=16828.6, num_updates=5100, lr=0.000885615, gnorm=0.273, clip=0, loss_scale=8, train_wall=83, gb_free=61.5, wall=4407 epoch 004: 41 / 1689 loss=3.991, nll_loss=2.48, ppl=5.58, wps=513727, ups=1.04, wpb=491687, bsz=16828.6, num_updates=5100, lr=0.000885615, gnorm=0.273, clip=0, loss_scale=8, train_wall=83, gb_free=61.5, wall=4407 epoch 004: 41 / 1689 loss=3.991, nll_loss=2.48, ppl=5.58, wps=513727, ups=1.04, wpb=491687, bsz=16828.6, num_updates=5100, lr=0.000885615, gnorm=0.273, clip=0, loss_scale=8, train_wall=83, gb_free=61.5, wall=4407 epoch 004: 141 / 1689 loss=3.975, nll_loss=2.462, ppl=5.51, wps=590834, ups=1.19, wpb=496024, bsz=16396.1, num_updates=5200, lr=0.000877058, gnorm=0.261, clip=0, loss_scale=8, train_wall=83, gb_free=61.6, wall=4491 epoch 004: 141 / 1689 loss=3.975, nll_loss=2.462, ppl=5.51, wps=590834, ups=1.19, wpb=496024, bsz=16396.1, num_updates=5200, lr=0.000877058, gnorm=0.261, clip=0, loss_scale=8, train_wall=83, gb_free=61.6, wall=4491 epoch 004: 141 / 1689 loss=3.975, nll_loss=2.462, ppl=5.51, wps=590834, ups=1.19, wpb=496024, bsz=16396.1, num_updates=5200, lr=0.000877058, gnorm=0.261, clip=0, loss_scale=8, train_wall=83, gb_free=61.6, wall=4491 epoch 004: 141 / 1689 loss=3.975, nll_loss=2.462, ppl=5.51, wps=590834, ups=1.19, wpb=496024, bsz=16396.1, num_updates=5200, lr=0.000877058, gnorm=0.261, clip=0, loss_scale=8, train_wall=83, gb_free=61.6, wall=4491 epoch 004: 242 / 1689 loss=3.968, nll_loss=2.455, ppl=5.48, wps=580577, ups=1.17, wpb=496338, bsz=16307.9, num_updates=5300, lr=0.000868744, gnorm=0.259, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=4576 epoch 004: 242 / 1689 loss=3.968, nll_loss=2.455, ppl=5.48, wps=580577, ups=1.17, wpb=496338, bsz=16307.9, num_updates=5300, lr=0.000868744, gnorm=0.259, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=4576 epoch 004: 242 / 1689 loss=3.968, nll_loss=2.455, ppl=5.48, wps=580577, ups=1.17, wpb=496338, bsz=16307.9, num_updates=5300, lr=0.000868744, gnorm=0.259, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=4576 epoch 004: 242 / 1689 loss=3.968, nll_loss=2.455, ppl=5.48, wps=580577, ups=1.17, wpb=496338, bsz=16307.9, num_updates=5300, lr=0.000868744, gnorm=0.259, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=4576 epoch 004: 342 / 1689 loss=3.962, nll_loss=2.449, ppl=5.46, wps=579932, ups=1.17, wpb=493975, bsz=16735.7, num_updates=5400, lr=0.000860663, gnorm=0.265, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=4662 epoch 004: 342 / 1689 loss=3.962, nll_loss=2.449, ppl=5.46, wps=579932, ups=1.17, wpb=493975, bsz=16735.7, num_updates=5400, lr=0.000860663, gnorm=0.265, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=4662 epoch 004: 342 / 1689 loss=3.962, nll_loss=2.449, ppl=5.46, wps=579932, ups=1.17, wpb=493975, bsz=16735.7, num_updates=5400, lr=0.000860663, gnorm=0.265, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=4662 epoch 004: 342 / 1689 loss=3.962, nll_loss=2.449, ppl=5.46, wps=579932, ups=1.17, wpb=493975, bsz=16735.7, num_updates=5400, lr=0.000860663, gnorm=0.265, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=4662 epoch 004: 442 / 1689 loss=3.951, nll_loss=2.436, ppl=5.41, wps=584817, ups=1.18, wpb=496824, bsz=16678.8, num_updates=5500, lr=0.000852803, gnorm=0.255, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=4746 epoch 004: 442 / 1689 loss=3.951, nll_loss=2.436, ppl=5.41, wps=584817, ups=1.18, wpb=496824, bsz=16678.8, num_updates=5500, lr=0.000852803, gnorm=0.255, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=4746 epoch 004: 442 / 1689 loss=3.951, nll_loss=2.436, ppl=5.41, wps=584817, ups=1.18, wpb=496824, bsz=16678.8, num_updates=5500, lr=0.000852803, gnorm=0.255, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=4746 epoch 004: 442 / 1689 loss=3.951, nll_loss=2.436, ppl=5.41, wps=584817, ups=1.18, wpb=496824, bsz=16678.8, num_updates=5500, lr=0.000852803, gnorm=0.255, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=4746 epoch 004: 542 / 1689 loss=3.962, nll_loss=2.449, ppl=5.46, wps=586891, ups=1.18, wpb=495916, bsz=16423.7, num_updates=5600, lr=0.000845154, gnorm=0.251, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=4831 epoch 004: 542 / 1689 loss=3.962, nll_loss=2.449, ppl=5.46, wps=586891, ups=1.18, wpb=495916, bsz=16423.7, num_updates=5600, lr=0.000845154, gnorm=0.251, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=4831 epoch 004: 542 / 1689 loss=3.962, nll_loss=2.449, ppl=5.46, wps=586891, ups=1.18, wpb=495916, bsz=16423.7, num_updates=5600, lr=0.000845154, gnorm=0.251, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=4831 epoch 004: 542 / 1689 loss=3.962, nll_loss=2.449, ppl=5.46, wps=586891, ups=1.18, wpb=495916, bsz=16423.7, num_updates=5600, lr=0.000845154, gnorm=0.251, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=4831 epoch 004: 642 / 1689 loss=3.948, nll_loss=2.433, ppl=5.4, wps=580622, ups=1.17, wpb=494379, bsz=16563, num_updates=5700, lr=0.000837708, gnorm=0.252, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=4916 epoch 004: 642 / 1689 loss=3.948, nll_loss=2.433, ppl=5.4, wps=580622, ups=1.17, wpb=494379, bsz=16563, num_updates=5700, lr=0.000837708, gnorm=0.252, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=4916 epoch 004: 642 / 1689 loss=3.948, nll_loss=2.433, ppl=5.4, wps=580622, ups=1.17, wpb=494379, bsz=16563, num_updates=5700, lr=0.000837708, gnorm=0.252, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=4916 epoch 004: 642 / 1689 loss=3.948, nll_loss=2.433, ppl=5.4, wps=580622, ups=1.17, wpb=494379, bsz=16563, num_updates=5700, lr=0.000837708, gnorm=0.252, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=4916 epoch 004: 742 / 1689 loss=3.942, nll_loss=2.428, ppl=5.38, wps=582254, ups=1.17, wpb=495835, bsz=16531, num_updates=5800, lr=0.000830455, gnorm=0.249, clip=0, loss_scale=8, train_wall=84, gb_free=61.8, wall=5001 epoch 004: 742 / 1689 loss=3.942, nll_loss=2.428, ppl=5.38, wps=582254, ups=1.17, wpb=495835, bsz=16531, num_updates=5800, lr=0.000830455, gnorm=0.249, clip=0, loss_scale=8, train_wall=84, gb_free=61.8, wall=5001 epoch 004: 742 / 1689 loss=3.942, nll_loss=2.428, ppl=5.38, wps=582254, ups=1.17, wpb=495835, bsz=16531, num_updates=5800, lr=0.000830455, gnorm=0.249, clip=0, loss_scale=8, train_wall=84, gb_free=61.8, wall=5001 epoch 004: 742 / 1689 loss=3.942, nll_loss=2.428, ppl=5.38, wps=582254, ups=1.17, wpb=495835, bsz=16531, num_updates=5800, lr=0.000830455, gnorm=0.249, clip=0, loss_scale=8, train_wall=84, gb_free=61.8, wall=5001 epoch 004: 843 / 1689 loss=3.946, nll_loss=2.433, ppl=5.4, wps=577893, ups=1.17, wpb=495278, bsz=16274.4, num_updates=5900, lr=0.000823387, gnorm=0.24, clip=0, loss_scale=4, train_wall=85, gb_free=61.9, wall=5087 epoch 004: 843 / 1689 loss=3.946, nll_loss=2.433, ppl=5.4, wps=577893, ups=1.17, wpb=495278, bsz=16274.4, num_updates=5900, lr=0.000823387, gnorm=0.24, clip=0, loss_scale=4, train_wall=85, gb_free=61.9, wall=5087 epoch 004: 843 / 1689 loss=3.946, nll_loss=2.433, ppl=5.4, wps=577893, ups=1.17, wpb=495278, bsz=16274.4, num_updates=5900, lr=0.000823387, gnorm=0.24, clip=0, loss_scale=4, train_wall=85, gb_free=61.9, wall=5087 epoch 004: 843 / 1689 loss=3.946, nll_loss=2.433, ppl=5.4, wps=577893, ups=1.17, wpb=495278, bsz=16274.4, num_updates=5900, lr=0.000823387, gnorm=0.24, clip=0, loss_scale=4, train_wall=85, gb_free=61.9, wall=5087 epoch 004: 943 / 1689 loss=3.929, nll_loss=2.413, ppl=5.33, wps=586883, ups=1.18, wpb=495818, bsz=16316.6, num_updates=6000, lr=0.000816497, gnorm=0.246, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=5171 epoch 004: 943 / 1689 loss=3.929, nll_loss=2.413, ppl=5.33, wps=586883, ups=1.18, wpb=495818, bsz=16316.6, num_updates=6000, lr=0.000816497, gnorm=0.246, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=5171 epoch 004: 943 / 1689 loss=3.929, nll_loss=2.413, ppl=5.33, wps=586883, ups=1.18, wpb=495818, bsz=16316.6, num_updates=6000, lr=0.000816497, gnorm=0.246, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=5171 epoch 004: 943 / 1689 loss=3.929, nll_loss=2.413, ppl=5.33, wps=586883, ups=1.18, wpb=495818, bsz=16316.6, num_updates=6000, lr=0.000816497, gnorm=0.246, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=5171 begin validation on "valid" subset epoch 004 | valid on 'valid' subset | loss 3.957 | nll_loss 2.422 | ppl 5.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 3.957 epoch 004 | valid on 'valid' subset | loss 3.957 | nll_loss 2.422 | ppl 5.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 3.957 epoch 004 | valid on 'valid' subset | loss 3.957 | nll_loss 2.422 | ppl 5.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 3.957 epoch 004 | valid on 'valid' subset | loss 3.957 | nll_loss 2.422 | ppl 5.36 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 3.957 epoch 004: 1043 / 1689 loss=3.925, nll_loss=2.409, ppl=5.31, wps=504488, ups=1.02, wpb=495074, bsz=16657.2, num_updates=6100, lr=0.000809776, gnorm=0.241, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=5270 epoch 004: 1043 / 1689 loss=3.925, nll_loss=2.409, ppl=5.31, wps=504488, ups=1.02, wpb=495074, bsz=16657.2, num_updates=6100, lr=0.000809776, gnorm=0.241, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=5270 epoch 004: 1043 / 1689 loss=3.925, nll_loss=2.409, ppl=5.31, wps=504488, ups=1.02, wpb=495074, bsz=16657.2, num_updates=6100, lr=0.000809776, gnorm=0.241, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=5270 epoch 004: 1043 / 1689 loss=3.925, nll_loss=2.409, ppl=5.31, wps=504488, ups=1.02, wpb=495074, bsz=16657.2, num_updates=6100, lr=0.000809776, gnorm=0.241, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=5270 epoch 004: 1143 / 1689 loss=3.925, nll_loss=2.41, ppl=5.32, wps=583452, ups=1.18, wpb=495020, bsz=16677, num_updates=6200, lr=0.000803219, gnorm=0.236, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=5354 epoch 004: 1143 / 1689 loss=3.925, nll_loss=2.41, ppl=5.32, wps=583452, ups=1.18, wpb=495020, bsz=16677, num_updates=6200, lr=0.000803219, gnorm=0.236, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=5354 epoch 004: 1143 / 1689 loss=3.925, nll_loss=2.41, ppl=5.32, wps=583452, ups=1.18, wpb=495020, bsz=16677, num_updates=6200, lr=0.000803219, gnorm=0.236, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=5354 epoch 004: 1143 / 1689 loss=3.925, nll_loss=2.41, ppl=5.32, wps=583452, ups=1.18, wpb=495020, bsz=16677, num_updates=6200, lr=0.000803219, gnorm=0.236, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=5354 epoch 004: 1243 / 1689 loss=3.918, nll_loss=2.402, ppl=5.29, wps=585048, ups=1.18, wpb=495036, bsz=16285.8, num_updates=6300, lr=0.000796819, gnorm=0.237, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=5439 epoch 004: 1243 / 1689 loss=3.918, nll_loss=2.402, ppl=5.29, wps=585048, ups=1.18, wpb=495036, bsz=16285.8, num_updates=6300, lr=0.000796819, gnorm=0.237, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=5439 epoch 004: 1243 / 1689 loss=3.918, nll_loss=2.402, ppl=5.29, wps=585048, ups=1.18, wpb=495036, bsz=16285.8, num_updates=6300, lr=0.000796819, gnorm=0.237, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=5439 epoch 004: 1243 / 1689 loss=3.918, nll_loss=2.402, ppl=5.29, wps=585048, ups=1.18, wpb=495036, bsz=16285.8, num_updates=6300, lr=0.000796819, gnorm=0.237, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=5439 epoch 004: 1343 / 1689 loss=3.916, nll_loss=2.4, ppl=5.28, wps=582466, ups=1.18, wpb=494325, bsz=16714.7, num_updates=6400, lr=0.000790569, gnorm=0.233, clip=0, loss_scale=8, train_wall=84, gb_free=61.4, wall=5524 epoch 004: 1343 / 1689 loss=3.916, nll_loss=2.4, ppl=5.28, wps=582466, ups=1.18, wpb=494325, bsz=16714.7, num_updates=6400, lr=0.000790569, gnorm=0.233, clip=0, loss_scale=8, train_wall=84, gb_free=61.4, wall=5524 epoch 004: 1343 / 1689 loss=3.916, nll_loss=2.4, ppl=5.28, wps=582466, ups=1.18, wpb=494325, bsz=16714.7, num_updates=6400, lr=0.000790569, gnorm=0.233, clip=0, loss_scale=8, train_wall=84, gb_free=61.4, wall=5524 epoch 004: 1343 / 1689 loss=3.916, nll_loss=2.4, ppl=5.28, wps=582466, ups=1.18, wpb=494325, bsz=16714.7, num_updates=6400, lr=0.000790569, gnorm=0.233, clip=0, loss_scale=8, train_wall=84, gb_free=61.4, wall=5524 epoch 004: 1443 / 1689 loss=3.909, nll_loss=2.393, ppl=5.25, wps=583952, ups=1.18, wpb=494315, bsz=16775.5, num_updates=6500, lr=0.000784465, gnorm=0.242, clip=0, loss_scale=8, train_wall=83, gb_free=62.4, wall=5609 epoch 004: 1443 / 1689 loss=3.909, nll_loss=2.393, ppl=5.25, wps=583952, ups=1.18, wpb=494315, bsz=16775.5, num_updates=6500, lr=0.000784465, gnorm=0.242, clip=0, loss_scale=8, train_wall=83, gb_free=62.4, wall=5609 epoch 004: 1443 / 1689 loss=3.909, nll_loss=2.393, ppl=5.25, wps=583952, ups=1.18, wpb=494315, bsz=16775.5, num_updates=6500, lr=0.000784465, gnorm=0.242, clip=0, loss_scale=8, train_wall=83, gb_free=62.4, wall=5609 epoch 004: 1443 / 1689 loss=3.909, nll_loss=2.393, ppl=5.25, wps=583952, ups=1.18, wpb=494315, bsz=16775.5, num_updates=6500, lr=0.000784465, gnorm=0.242, clip=0, loss_scale=8, train_wall=83, gb_free=62.4, wall=5609 epoch 004: 1544 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=576570, ups=1.17, wpb=494778, bsz=16383.8, num_updates=6600, lr=0.000778499, gnorm=0.231, clip=0, loss_scale=4, train_wall=85, gb_free=59.5, wall=5694 epoch 004: 1544 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=576570, ups=1.17, wpb=494778, bsz=16383.8, num_updates=6600, lr=0.000778499, gnorm=0.231, clip=0, loss_scale=4, train_wall=85, gb_free=59.5, wall=5694 epoch 004: 1544 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=576570, ups=1.17, wpb=494778, bsz=16383.8, num_updates=6600, lr=0.000778499, gnorm=0.231, clip=0, loss_scale=4, train_wall=85, gb_free=59.5, wall=5694 epoch 004: 1544 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=576570, ups=1.17, wpb=494778, bsz=16383.8, num_updates=6600, lr=0.000778499, gnorm=0.231, clip=0, loss_scale=4, train_wall=85, gb_free=59.5, wall=5694 epoch 004: 1644 / 1689 loss=3.895, nll_loss=2.377, ppl=5.2, wps=586532, ups=1.18, wpb=497021, bsz=16200.4, num_updates=6700, lr=0.000772667, gnorm=0.216, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=5779 epoch 004: 1644 / 1689 loss=3.895, nll_loss=2.377, ppl=5.2, wps=586532, ups=1.18, wpb=497021, bsz=16200.4, num_updates=6700, lr=0.000772667, gnorm=0.216, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=5779 epoch 004: 1644 / 1689 loss=3.895, nll_loss=2.377, ppl=5.2, wps=586532, ups=1.18, wpb=497021, bsz=16200.4, num_updates=6700, lr=0.000772667, gnorm=0.216, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=5779 epoch 004: 1644 / 1689 loss=3.895, nll_loss=2.377, ppl=5.2, wps=586532, ups=1.18, wpb=497021, bsz=16200.4, num_updates=6700, lr=0.000772667, gnorm=0.216, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=5779 end of epoch 4 (average epoch stats below) epoch 004 | loss 3.936 | nll_loss 2.421 | ppl 5.36 | wps 577636 | ups 1.17 | wpb 495136 | bsz 16503.3 | num_updates 6745 | lr 0.000770086 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 1412 | gb_free 63.1 | wall 5817 epoch 004 | loss 3.936 | nll_loss 2.421 | ppl 5.36 | wps 577636 | ups 1.17 | wpb 495136 | bsz 16503.3 | num_updates 6745 | lr 0.000770086 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 1412 | gb_free 63.1 | wall 5817 epoch 004 | loss 3.936 | nll_loss 2.421 | ppl 5.36 | wps 577636 | ups 1.17 | wpb 495136 | bsz 16503.3 | num_updates 6745 | lr 0.000770086 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 1412 | gb_free 63.1 | wall 5817 epoch 004 | loss 3.936 | nll_loss 2.421 | ppl 5.36 | wps 577636 | ups 1.17 | wpb 495136 | bsz 16503.3 | num_updates 6745 | lr 0.000770086 | gnorm 0.245 | clip 0 | loss_scale 4 | train_wall 1412 | gb_free 63.1 | wall 5817 Start iterating over samples epoch 005: 55 / 1689 loss=3.881, nll_loss=2.361, ppl=5.14, wps=579577, ups=1.18, wpb=491086, bsz=16238.4, num_updates=6800, lr=0.000766965, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=58.6, wall=5864 epoch 005: 55 / 1689 loss=3.881, nll_loss=2.361, ppl=5.14, wps=579577, ups=1.18, wpb=491086, bsz=16238.4, num_updates=6800, lr=0.000766965, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=58.6, wall=5864 epoch 005: 55 / 1689 loss=3.881, nll_loss=2.361, ppl=5.14, wps=579577, ups=1.18, wpb=491086, bsz=16238.4, num_updates=6800, lr=0.000766965, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=58.6, wall=5864 epoch 005: 55 / 1689 loss=3.881, nll_loss=2.361, ppl=5.14, wps=579577, ups=1.18, wpb=491086, bsz=16238.4, num_updates=6800, lr=0.000766965, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=58.6, wall=5864 epoch 005: 55 / 1689 loss=3.881, nll_loss=2.361, ppl=5.14, wps=579577, ups=1.18, wpb=491086, bsz=16238.4, num_updates=6800, lr=0.000766965, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=58.6, wall=5864 epoch 005: 155 / 1689 loss=3.863, nll_loss=2.341, ppl=5.07, wps=586380, ups=1.18, wpb=495144, bsz=16475.8, num_updates=6900, lr=0.000761387, gnorm=0.221, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=5948 epoch 005: 155 / 1689 loss=3.863, nll_loss=2.341, ppl=5.07, wps=586380, ups=1.18, wpb=495144, bsz=16475.8, num_updates=6900, lr=0.000761387, gnorm=0.221, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=5948 epoch 005: 155 / 1689 loss=3.863, nll_loss=2.341, ppl=5.07, wps=586380, ups=1.18, wpb=495144, bsz=16475.8, num_updates=6900, lr=0.000761387, gnorm=0.221, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=5948 epoch 005: 155 / 1689 loss=3.863, nll_loss=2.341, ppl=5.07, wps=586380, ups=1.18, wpb=495144, bsz=16475.8, num_updates=6900, lr=0.000761387, gnorm=0.221, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=5948 epoch 005: 155 / 1689 loss=3.863, nll_loss=2.341, ppl=5.07, wps=586380, ups=1.18, wpb=495144, bsz=16475.8, num_updates=6900, lr=0.000761387, gnorm=0.221, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=5948 epoch 005: 255 / 1689 loss=3.858, nll_loss=2.335, ppl=5.05, wps=584823, ups=1.18, wpb=494442, bsz=16387.3, num_updates=7000, lr=0.000755929, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=6033 epoch 005: 255 / 1689 loss=3.858, nll_loss=2.335, ppl=5.05, wps=584823, ups=1.18, wpb=494442, bsz=16387.3, num_updates=7000, lr=0.000755929, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=6033 epoch 005: 255 / 1689 loss=3.858, nll_loss=2.335, ppl=5.05, wps=584823, ups=1.18, wpb=494442, bsz=16387.3, num_updates=7000, lr=0.000755929, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=6033 epoch 005: 255 / 1689 loss=3.858, nll_loss=2.335, ppl=5.05, wps=584823, ups=1.18, wpb=494442, bsz=16387.3, num_updates=7000, lr=0.000755929, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=6033 epoch 005: 255 / 1689 loss=3.858, nll_loss=2.335, ppl=5.05, wps=584823, ups=1.18, wpb=494442, bsz=16387.3, num_updates=7000, lr=0.000755929, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=6033 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 3.902 | nll_loss 2.357 | ppl 5.12 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.902 epoch 005 | valid on 'valid' subset | loss 3.902 | nll_loss 2.357 | ppl 5.12 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.902 epoch 005 | valid on 'valid' subset | loss 3.902 | nll_loss 2.357 | ppl 5.12 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.902 epoch 005 | valid on 'valid' subset | loss 3.902 | nll_loss 2.357 | ppl 5.12 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.902 epoch 005 | valid on 'valid' subset | loss 3.902 | nll_loss 2.357 | ppl 5.12 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.902 epoch 005: 355 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=520324, ups=1.05, wpb=495012, bsz=16464.3, num_updates=7100, lr=0.000750587, gnorm=0.226, clip=0, loss_scale=8, train_wall=83, gb_free=61.6, wall=6128 epoch 005: 355 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=520324, ups=1.05, wpb=495012, bsz=16464.3, num_updates=7100, lr=0.000750587, gnorm=0.226, clip=0, loss_scale=8, train_wall=83, gb_free=61.6, wall=6128 epoch 005: 355 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=520324, ups=1.05, wpb=495012, bsz=16464.3, num_updates=7100, lr=0.000750587, gnorm=0.226, clip=0, loss_scale=8, train_wall=83, gb_free=61.6, wall=6128 epoch 005: 355 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=520324, ups=1.05, wpb=495012, bsz=16464.3, num_updates=7100, lr=0.000750587, gnorm=0.226, clip=0, loss_scale=8, train_wall=83, gb_free=61.6, wall=6128 epoch 005: 355 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=520324, ups=1.05, wpb=495012, bsz=16464.3, num_updates=7100, lr=0.000750587, gnorm=0.226, clip=0, loss_scale=8, train_wall=83, gb_free=61.6, wall=6128 epoch 005: 456 / 1689 loss=3.855, nll_loss=2.333, ppl=5.04, wps=583688, ups=1.18, wpb=496487, bsz=16648.4, num_updates=7200, lr=0.000745356, gnorm=0.212, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=6213 epoch 005: 456 / 1689 loss=3.855, nll_loss=2.333, ppl=5.04, wps=583688, ups=1.18, wpb=496487, bsz=16648.4, num_updates=7200, lr=0.000745356, gnorm=0.212, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=6213 epoch 005: 456 / 1689 loss=3.855, nll_loss=2.333, ppl=5.04, wps=583688, ups=1.18, wpb=496487, bsz=16648.4, num_updates=7200, lr=0.000745356, gnorm=0.212, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=6213 epoch 005: 456 / 1689 loss=3.855, nll_loss=2.333, ppl=5.04, wps=583688, ups=1.18, wpb=496487, bsz=16648.4, num_updates=7200, lr=0.000745356, gnorm=0.212, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=6213 epoch 005: 456 / 1689 loss=3.855, nll_loss=2.333, ppl=5.04, wps=583688, ups=1.18, wpb=496487, bsz=16648.4, num_updates=7200, lr=0.000745356, gnorm=0.212, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=6213 epoch 005: 556 / 1689 loss=3.853, nll_loss=2.331, ppl=5.03, wps=587377, ups=1.18, wpb=495804, bsz=16580.6, num_updates=7300, lr=0.000740233, gnorm=0.213, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=6297 epoch 005: 556 / 1689 loss=3.853, nll_loss=2.331, ppl=5.03, wps=587377, ups=1.18, wpb=495804, bsz=16580.6, num_updates=7300, lr=0.000740233, gnorm=0.213, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=6297 epoch 005: 556 / 1689 loss=3.853, nll_loss=2.331, ppl=5.03, wps=587377, ups=1.18, wpb=495804, bsz=16580.6, num_updates=7300, lr=0.000740233, gnorm=0.213, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=6297 epoch 005: 556 / 1689 loss=3.853, nll_loss=2.331, ppl=5.03, wps=587377, ups=1.18, wpb=495804, bsz=16580.6, num_updates=7300, lr=0.000740233, gnorm=0.213, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=6297 epoch 005: 556 / 1689 loss=3.853, nll_loss=2.331, ppl=5.03, wps=587377, ups=1.18, wpb=495804, bsz=16580.6, num_updates=7300, lr=0.000740233, gnorm=0.213, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=6297 epoch 005: 656 / 1689 loss=3.859, nll_loss=2.338, ppl=5.05, wps=582516, ups=1.18, wpb=495704, bsz=16432, num_updates=7400, lr=0.000735215, gnorm=0.219, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=6383 epoch 005: 656 / 1689 loss=3.859, nll_loss=2.338, ppl=5.05, wps=582516, ups=1.18, wpb=495704, bsz=16432, num_updates=7400, lr=0.000735215, gnorm=0.219, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=6383 epoch 005: 656 / 1689 loss=3.859, nll_loss=2.338, ppl=5.05, wps=582516, ups=1.18, wpb=495704, bsz=16432, num_updates=7400, lr=0.000735215, gnorm=0.219, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=6383 epoch 005: 656 / 1689 loss=3.859, nll_loss=2.338, ppl=5.05, wps=582516, ups=1.18, wpb=495704, bsz=16432, num_updates=7400, lr=0.000735215, gnorm=0.219, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=6383 epoch 005: 656 / 1689 loss=3.859, nll_loss=2.338, ppl=5.05, wps=582516, ups=1.18, wpb=495704, bsz=16432, num_updates=7400, lr=0.000735215, gnorm=0.219, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=6383 epoch 005: 756 / 1689 loss=3.86, nll_loss=2.339, ppl=5.06, wps=586857, ups=1.19, wpb=494399, bsz=16383.8, num_updates=7500, lr=0.000730297, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=6467 epoch 005: 756 / 1689 loss=3.86, nll_loss=2.339, ppl=5.06, wps=586857, ups=1.19, wpb=494399, bsz=16383.8, num_updates=7500, lr=0.000730297, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=6467 epoch 005: 756 / 1689 loss=3.86, nll_loss=2.339, ppl=5.06, wps=586857, ups=1.19, wpb=494399, bsz=16383.8, num_updates=7500, lr=0.000730297, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=6467 epoch 005: 756 / 1689 loss=3.86, nll_loss=2.339, ppl=5.06, wps=586857, ups=1.19, wpb=494399, bsz=16383.8, num_updates=7500, lr=0.000730297, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=6467 epoch 005: 756 / 1689 loss=3.86, nll_loss=2.339, ppl=5.06, wps=586857, ups=1.19, wpb=494399, bsz=16383.8, num_updates=7500, lr=0.000730297, gnorm=0.227, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=6467 epoch 005: 856 / 1689 loss=3.854, nll_loss=2.332, ppl=5.04, wps=583710, ups=1.18, wpb=495323, bsz=16568.4, num_updates=7600, lr=0.000725476, gnorm=0.209, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=6552 epoch 005: 856 / 1689 loss=3.854, nll_loss=2.332, ppl=5.04, wps=583710, ups=1.18, wpb=495323, bsz=16568.4, num_updates=7600, lr=0.000725476, gnorm=0.209, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=6552 epoch 005: 856 / 1689 loss=3.854, nll_loss=2.332, ppl=5.04, wps=583710, ups=1.18, wpb=495323, bsz=16568.4, num_updates=7600, lr=0.000725476, gnorm=0.209, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=6552 epoch 005: 856 / 1689 loss=3.854, nll_loss=2.332, ppl=5.04, wps=583710, ups=1.18, wpb=495323, bsz=16568.4, num_updates=7600, lr=0.000725476, gnorm=0.209, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=6552 epoch 005: 856 / 1689 loss=3.854, nll_loss=2.332, ppl=5.04, wps=583710, ups=1.18, wpb=495323, bsz=16568.4, num_updates=7600, lr=0.000725476, gnorm=0.209, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=6552 epoch 005: 956 / 1689 loss=3.86, nll_loss=2.339, ppl=5.06, wps=586903, ups=1.18, wpb=495898, bsz=16465, num_updates=7700, lr=0.00072075, gnorm=0.218, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=6636 epoch 005: 956 / 1689 loss=3.86, nll_loss=2.339, ppl=5.06, wps=586903, ups=1.18, wpb=495898, bsz=16465, num_updates=7700, lr=0.00072075, gnorm=0.218, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=6636 epoch 005: 956 / 1689 loss=3.86, nll_loss=2.339, ppl=5.06, wps=586903, ups=1.18, wpb=495898, bsz=16465, num_updates=7700, lr=0.00072075, gnorm=0.218, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=6636 epoch 005: 956 / 1689 loss=3.86, nll_loss=2.339, ppl=5.06, wps=586903, ups=1.18, wpb=495898, bsz=16465, num_updates=7700, lr=0.00072075, gnorm=0.218, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=6636 epoch 005: 956 / 1689 loss=3.86, nll_loss=2.339, ppl=5.06, wps=586903, ups=1.18, wpb=495898, bsz=16465, num_updates=7700, lr=0.00072075, gnorm=0.218, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=6636 epoch 005: 1057 / 1689 loss=3.847, nll_loss=2.325, ppl=5.01, wps=581046, ups=1.17, wpb=497833, bsz=16672.2, num_updates=7800, lr=0.000716115, gnorm=0.208, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=6722 epoch 005: 1057 / 1689 loss=3.847, nll_loss=2.325, ppl=5.01, wps=581046, ups=1.17, wpb=497833, bsz=16672.2, num_updates=7800, lr=0.000716115, gnorm=0.208, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=6722 epoch 005: 1057 / 1689 loss=3.847, nll_loss=2.325, ppl=5.01, wps=581046, ups=1.17, wpb=497833, bsz=16672.2, num_updates=7800, lr=0.000716115, gnorm=0.208, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=6722 epoch 005: 1057 / 1689 loss=3.847, nll_loss=2.325, ppl=5.01, wps=581046, ups=1.17, wpb=497833, bsz=16672.2, num_updates=7800, lr=0.000716115, gnorm=0.208, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=6722 epoch 005: 1057 / 1689 loss=3.847, nll_loss=2.325, ppl=5.01, wps=581046, ups=1.17, wpb=497833, bsz=16672.2, num_updates=7800, lr=0.000716115, gnorm=0.208, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=6722 epoch 005: 1157 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=583361, ups=1.18, wpb=493839, bsz=16664.2, num_updates=7900, lr=0.000711568, gnorm=0.21, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=6807 epoch 005: 1157 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=583361, ups=1.18, wpb=493839, bsz=16664.2, num_updates=7900, lr=0.000711568, gnorm=0.21, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=6807 epoch 005: 1157 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=583361, ups=1.18, wpb=493839, bsz=16664.2, num_updates=7900, lr=0.000711568, gnorm=0.21, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=6807 epoch 005: 1157 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=583361, ups=1.18, wpb=493839, bsz=16664.2, num_updates=7900, lr=0.000711568, gnorm=0.21, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=6807 epoch 005: 1157 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=583361, ups=1.18, wpb=493839, bsz=16664.2, num_updates=7900, lr=0.000711568, gnorm=0.21, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=6807 epoch 005: 1258 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=577080, ups=1.17, wpb=494076, bsz=16424.2, num_updates=8000, lr=0.000707107, gnorm=0.209, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=6892 epoch 005: 1258 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=577080, ups=1.17, wpb=494076, bsz=16424.2, num_updates=8000, lr=0.000707107, gnorm=0.209, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=6892 epoch 005: 1258 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=577080, ups=1.17, wpb=494076, bsz=16424.2, num_updates=8000, lr=0.000707107, gnorm=0.209, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=6892 epoch 005: 1258 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=577080, ups=1.17, wpb=494076, bsz=16424.2, num_updates=8000, lr=0.000707107, gnorm=0.209, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=6892 epoch 005: 1258 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=577080, ups=1.17, wpb=494076, bsz=16424.2, num_updates=8000, lr=0.000707107, gnorm=0.209, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=6892 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 3.865 | nll_loss 2.32 | ppl 4.99 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.865 epoch 005 | valid on 'valid' subset | loss 3.865 | nll_loss 2.32 | ppl 4.99 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.865 epoch 005 | valid on 'valid' subset | loss 3.865 | nll_loss 2.32 | ppl 4.99 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.865 epoch 005 | valid on 'valid' subset | loss 3.865 | nll_loss 2.32 | ppl 4.99 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.865 epoch 005 | valid on 'valid' subset | loss 3.865 | nll_loss 2.32 | ppl 4.99 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.865 epoch 005: 1358 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=514435, ups=1.04, wpb=495112, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.215, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=6988 epoch 005: 1358 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=514435, ups=1.04, wpb=495112, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.215, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=6988 epoch 005: 1358 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=514435, ups=1.04, wpb=495112, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.215, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=6988 epoch 005: 1358 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=514435, ups=1.04, wpb=495112, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.215, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=6988 epoch 005: 1358 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=514435, ups=1.04, wpb=495112, bsz=16727.7, num_updates=8100, lr=0.000702728, gnorm=0.215, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=6988 epoch 005: 1458 / 1689 loss=3.836, nll_loss=2.314, ppl=4.97, wps=585058, ups=1.18, wpb=495608, bsz=16479.6, num_updates=8200, lr=0.00069843, gnorm=0.202, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=7073 epoch 005: 1458 / 1689 loss=3.836, nll_loss=2.314, ppl=4.97, wps=585058, ups=1.18, wpb=495608, bsz=16479.6, num_updates=8200, lr=0.00069843, gnorm=0.202, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=7073 epoch 005: 1458 / 1689 loss=3.836, nll_loss=2.314, ppl=4.97, wps=585058, ups=1.18, wpb=495608, bsz=16479.6, num_updates=8200, lr=0.00069843, gnorm=0.202, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=7073 epoch 005: 1458 / 1689 loss=3.836, nll_loss=2.314, ppl=4.97, wps=585058, ups=1.18, wpb=495608, bsz=16479.6, num_updates=8200, lr=0.00069843, gnorm=0.202, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=7073 epoch 005: 1458 / 1689 loss=3.836, nll_loss=2.314, ppl=4.97, wps=585058, ups=1.18, wpb=495608, bsz=16479.6, num_updates=8200, lr=0.00069843, gnorm=0.202, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=7073 epoch 005: 1558 / 1689 loss=3.836, nll_loss=2.314, ppl=4.97, wps=590283, ups=1.19, wpb=495981, bsz=16139.1, num_updates=8300, lr=0.00069421, gnorm=0.198, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=7157 epoch 005: 1558 / 1689 loss=3.836, nll_loss=2.314, ppl=4.97, wps=590283, ups=1.19, wpb=495981, bsz=16139.1, num_updates=8300, lr=0.00069421, gnorm=0.198, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=7157 epoch 005: 1558 / 1689 loss=3.836, nll_loss=2.314, ppl=4.97, wps=590283, ups=1.19, wpb=495981, bsz=16139.1, num_updates=8300, lr=0.00069421, gnorm=0.198, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=7157 epoch 005: 1558 / 1689 loss=3.836, nll_loss=2.314, ppl=4.97, wps=590283, ups=1.19, wpb=495981, bsz=16139.1, num_updates=8300, lr=0.00069421, gnorm=0.198, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=7157 epoch 005: 1558 / 1689 loss=3.836, nll_loss=2.314, ppl=4.97, wps=590283, ups=1.19, wpb=495981, bsz=16139.1, num_updates=8300, lr=0.00069421, gnorm=0.198, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=7157 epoch 005: 1658 / 1689 loss=3.834, nll_loss=2.311, ppl=4.96, wps=581714, ups=1.17, wpb=495442, bsz=16865, num_updates=8400, lr=0.000690066, gnorm=0.209, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=7242 epoch 005: 1658 / 1689 loss=3.834, nll_loss=2.311, ppl=4.96, wps=581714, ups=1.17, wpb=495442, bsz=16865, num_updates=8400, lr=0.000690066, gnorm=0.209, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=7242 epoch 005: 1658 / 1689 loss=3.834, nll_loss=2.311, ppl=4.96, wps=581714, ups=1.17, wpb=495442, bsz=16865, num_updates=8400, lr=0.000690066, gnorm=0.209, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=7242 epoch 005: 1658 / 1689 loss=3.834, nll_loss=2.311, ppl=4.96, wps=581714, ups=1.17, wpb=495442, bsz=16865, num_updates=8400, lr=0.000690066, gnorm=0.209, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=7242 epoch 005: 1658 / 1689 loss=3.834, nll_loss=2.311, ppl=4.96, wps=581714, ups=1.17, wpb=495442, bsz=16865, num_updates=8400, lr=0.000690066, gnorm=0.209, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=7242 end of epoch 5 (average epoch stats below) epoch 005 | loss 3.85 | nll_loss 2.329 | ppl 5.02 | wps 575208 | ups 1.16 | wpb 495132 | bsz 16503.2 | num_updates 8431 | lr 0.000688796 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1410 | gb_free 62.1 | wall 7268 epoch 005 | loss 3.85 | nll_loss 2.329 | ppl 5.02 | wps 575208 | ups 1.16 | wpb 495132 | bsz 16503.2 | num_updates 8431 | lr 0.000688796 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1410 | gb_free 62.1 | wall 7268 epoch 005 | loss 3.85 | nll_loss 2.329 | ppl 5.02 | wps 575208 | ups 1.16 | wpb 495132 | bsz 16503.2 | num_updates 8431 | lr 0.000688796 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1410 | gb_free 62.1 | wall 7268 epoch 005 | loss 3.85 | nll_loss 2.329 | ppl 5.02 | wps 575208 | ups 1.16 | wpb 495132 | bsz 16503.2 | num_updates 8431 | lr 0.000688796 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1410 | gb_free 62.1 | wall 7268 epoch 005 | loss 3.85 | nll_loss 2.329 | ppl 5.02 | wps 575208 | ups 1.16 | wpb 495132 | bsz 16503.2 | num_updates 8431 | lr 0.000688796 | gnorm 0.214 | clip 0 | loss_scale 2 | train_wall 1410 | gb_free 62.1 | wall 7268 Start iterating over samples epoch 006: 69 / 1689 loss=3.803, nll_loss=2.276, ppl=4.84, wps=583264, ups=1.18, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.202, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7327 epoch 006: 69 / 1689 loss=3.803, nll_loss=2.276, ppl=4.84, wps=583264, ups=1.18, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.202, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7327 epoch 006: 69 / 1689 loss=3.803, nll_loss=2.276, ppl=4.84, wps=583264, ups=1.18, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.202, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7327 epoch 006: 69 / 1689 loss=3.803, nll_loss=2.276, ppl=4.84, wps=583264, ups=1.18, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.202, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7327 epoch 006: 69 / 1689 loss=3.803, nll_loss=2.276, ppl=4.84, wps=583264, ups=1.18, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.202, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7327 epoch 006: 69 / 1689 loss=3.803, nll_loss=2.276, ppl=4.84, wps=583264, ups=1.18, wpb=492669, bsz=15980, num_updates=8500, lr=0.000685994, gnorm=0.202, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7327 epoch 006: 169 / 1689 loss=3.798, nll_loss=2.271, ppl=4.83, wps=586297, ups=1.18, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7411 epoch 006: 169 / 1689 loss=3.798, nll_loss=2.271, ppl=4.83, wps=586297, ups=1.18, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7411 epoch 006: 169 / 1689 loss=3.798, nll_loss=2.271, ppl=4.83, wps=586297, ups=1.18, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7411 epoch 006: 169 / 1689 loss=3.798, nll_loss=2.271, ppl=4.83, wps=586297, ups=1.18, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7411 epoch 006: 169 / 1689 loss=3.798, nll_loss=2.271, ppl=4.83, wps=586297, ups=1.18, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7411 epoch 006: 169 / 1689 loss=3.798, nll_loss=2.271, ppl=4.83, wps=586297, ups=1.18, wpb=495379, bsz=16313.9, num_updates=8600, lr=0.000681994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7411 epoch 006: 269 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=585916, ups=1.18, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.201, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=7496 epoch 006: 269 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=585916, ups=1.18, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.201, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=7496 epoch 006: 269 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=585916, ups=1.18, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.201, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=7496 epoch 006: 269 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=585916, ups=1.18, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.201, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=7496 epoch 006: 269 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=585916, ups=1.18, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.201, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=7496 epoch 006: 269 / 1689 loss=3.793, nll_loss=2.266, ppl=4.81, wps=585916, ups=1.18, wpb=495942, bsz=16622.8, num_updates=8700, lr=0.000678064, gnorm=0.201, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=7496 epoch 006: 369 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=586253, ups=1.19, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.205, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7580 epoch 006: 369 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=586253, ups=1.19, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.205, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7580 epoch 006: 369 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=586253, ups=1.19, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.205, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7580 epoch 006: 369 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=586253, ups=1.19, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.205, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7580 epoch 006: 369 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=586253, ups=1.19, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.205, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7580 epoch 006: 369 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=586253, ups=1.19, wpb=494476, bsz=16313.7, num_updates=8800, lr=0.0006742, gnorm=0.205, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=7580 epoch 006: 469 / 1689 loss=3.795, nll_loss=2.268, ppl=4.82, wps=585779, ups=1.18, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.203, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=7665 epoch 006: 469 / 1689 loss=3.795, nll_loss=2.268, ppl=4.82, wps=585779, ups=1.18, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.203, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=7665 epoch 006: 469 / 1689 loss=3.795, nll_loss=2.268, ppl=4.82, wps=585779, ups=1.18, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.203, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=7665 epoch 006: 469 / 1689 loss=3.795, nll_loss=2.268, ppl=4.82, wps=585779, ups=1.18, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.203, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=7665 epoch 006: 469 / 1689 loss=3.795, nll_loss=2.268, ppl=4.82, wps=585779, ups=1.18, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.203, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=7665 epoch 006: 469 / 1689 loss=3.795, nll_loss=2.268, ppl=4.82, wps=585779, ups=1.18, wpb=496002, bsz=16443.6, num_updates=8900, lr=0.000670402, gnorm=0.203, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=7665 epoch 006: 570 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=577213, ups=1.16, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.199, clip=0, loss_scale=4, train_wall=85, gb_free=61.6, wall=7751 epoch 006: 570 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=577213, ups=1.16, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.199, clip=0, loss_scale=4, train_wall=85, gb_free=61.6, wall=7751 epoch 006: 570 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=577213, ups=1.16, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.199, clip=0, loss_scale=4, train_wall=85, gb_free=61.6, wall=7751 epoch 006: 570 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=577213, ups=1.16, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.199, clip=0, loss_scale=4, train_wall=85, gb_free=61.6, wall=7751 epoch 006: 570 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=577213, ups=1.16, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.199, clip=0, loss_scale=4, train_wall=85, gb_free=61.6, wall=7751 epoch 006: 570 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=577213, ups=1.16, wpb=495486, bsz=16670.2, num_updates=9000, lr=0.000666667, gnorm=0.199, clip=0, loss_scale=4, train_wall=85, gb_free=61.6, wall=7751 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 3.849 | nll_loss 2.298 | ppl 4.92 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.849 epoch 006 | valid on 'valid' subset | loss 3.849 | nll_loss 2.298 | ppl 4.92 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.849 epoch 006 | valid on 'valid' subset | loss 3.849 | nll_loss 2.298 | ppl 4.92 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.849 epoch 006 | valid on 'valid' subset | loss 3.849 | nll_loss 2.298 | ppl 4.92 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.849 epoch 006 | valid on 'valid' subset | loss 3.849 | nll_loss 2.298 | ppl 4.92 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.849 epoch 006 | valid on 'valid' subset | loss 3.849 | nll_loss 2.298 | ppl 4.92 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.849 epoch 006: 670 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=512232, ups=1.04, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=7847 epoch 006: 670 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=512232, ups=1.04, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=7847 epoch 006: 670 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=512232, ups=1.04, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=7847 epoch 006: 670 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=512232, ups=1.04, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=7847 epoch 006: 670 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=512232, ups=1.04, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=7847 epoch 006: 670 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=512232, ups=1.04, wpb=494036, bsz=16817.4, num_updates=9100, lr=0.000662994, gnorm=0.199, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=7847 epoch 006: 770 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=584316, ups=1.18, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.195, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=7932 epoch 006: 770 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=584316, ups=1.18, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.195, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=7932 epoch 006: 770 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=584316, ups=1.18, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.195, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=7932 epoch 006: 770 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=584316, ups=1.18, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.195, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=7932 epoch 006: 770 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=584316, ups=1.18, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.195, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=7932 epoch 006: 770 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=584316, ups=1.18, wpb=495786, bsz=16402.5, num_updates=9200, lr=0.00065938, gnorm=0.195, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=7932 epoch 006: 870 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=587227, ups=1.19, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.204, clip=0, loss_scale=4, train_wall=83, gb_free=60.6, wall=8016 epoch 006: 870 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=587227, ups=1.19, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.204, clip=0, loss_scale=4, train_wall=83, gb_free=60.6, wall=8016 epoch 006: 870 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=587227, ups=1.19, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.204, clip=0, loss_scale=4, train_wall=83, gb_free=60.6, wall=8016 epoch 006: 870 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=587227, ups=1.19, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.204, clip=0, loss_scale=4, train_wall=83, gb_free=60.6, wall=8016 epoch 006: 870 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=587227, ups=1.19, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.204, clip=0, loss_scale=4, train_wall=83, gb_free=60.6, wall=8016 epoch 006: 870 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=587227, ups=1.19, wpb=495285, bsz=16919.1, num_updates=9300, lr=0.000655826, gnorm=0.204, clip=0, loss_scale=4, train_wall=83, gb_free=60.6, wall=8016 epoch 006: 970 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=584015, ups=1.18, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.194, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=8101 epoch 006: 970 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=584015, ups=1.18, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.194, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=8101 epoch 006: 970 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=584015, ups=1.18, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.194, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=8101 epoch 006: 970 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=584015, ups=1.18, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.194, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=8101 epoch 006: 970 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=584015, ups=1.18, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.194, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=8101 epoch 006: 970 / 1689 loss=3.802, nll_loss=2.277, ppl=4.85, wps=584015, ups=1.18, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.194, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=8101 epoch 006: 1070 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=585015, ups=1.18, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.196, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=8185 epoch 006: 1070 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=585015, ups=1.18, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.196, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=8185 epoch 006: 1070 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=585015, ups=1.18, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.196, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=8185 epoch 006: 1070 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=585015, ups=1.18, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.196, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=8185 epoch 006: 1070 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=585015, ups=1.18, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.196, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=8185 epoch 006: 1070 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=585015, ups=1.18, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.196, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=8185 epoch 006: 1171 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=581623, ups=1.18, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.197, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8270 epoch 006: 1171 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=581623, ups=1.18, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.197, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8270 epoch 006: 1171 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=581623, ups=1.18, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.197, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8270 epoch 006: 1171 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=581623, ups=1.18, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.197, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8270 epoch 006: 1171 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=581623, ups=1.18, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.197, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8270 epoch 006: 1171 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=581623, ups=1.18, wpb=494415, bsz=16739.4, num_updates=9600, lr=0.000645497, gnorm=0.197, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8270 epoch 006: 1271 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=584412, ups=1.18, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.196, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=8355 epoch 006: 1271 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=584412, ups=1.18, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.196, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=8355 epoch 006: 1271 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=584412, ups=1.18, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.196, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=8355 epoch 006: 1271 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=584412, ups=1.18, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.196, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=8355 epoch 006: 1271 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=584412, ups=1.18, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.196, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=8355 epoch 006: 1271 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=584412, ups=1.18, wpb=495561, bsz=16173.8, num_updates=9700, lr=0.000642161, gnorm=0.196, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=8355 epoch 006: 1371 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=586567, ups=1.18, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.192, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8440 epoch 006: 1371 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=586567, ups=1.18, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.192, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8440 epoch 006: 1371 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=586567, ups=1.18, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.192, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8440 epoch 006: 1371 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=586567, ups=1.18, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.192, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8440 epoch 006: 1371 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=586567, ups=1.18, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.192, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8440 epoch 006: 1371 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=586567, ups=1.18, wpb=497359, bsz=16719.1, num_updates=9800, lr=0.000638877, gnorm=0.192, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8440 epoch 006: 1471 / 1689 loss=3.795, nll_loss=2.27, ppl=4.82, wps=586061, ups=1.18, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8524 epoch 006: 1471 / 1689 loss=3.795, nll_loss=2.27, ppl=4.82, wps=586061, ups=1.18, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8524 epoch 006: 1471 / 1689 loss=3.795, nll_loss=2.27, ppl=4.82, wps=586061, ups=1.18, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8524 epoch 006: 1471 / 1689 loss=3.795, nll_loss=2.27, ppl=4.82, wps=586061, ups=1.18, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8524 epoch 006: 1471 / 1689 loss=3.795, nll_loss=2.27, ppl=4.82, wps=586061, ups=1.18, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8524 epoch 006: 1471 / 1689 loss=3.795, nll_loss=2.27, ppl=4.82, wps=586061, ups=1.18, wpb=494589, bsz=16329, num_updates=9900, lr=0.000635642, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8524 epoch 006: 1571 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=586725, ups=1.18, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8609 epoch 006: 1571 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=586725, ups=1.18, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8609 epoch 006: 1571 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=586725, ups=1.18, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8609 epoch 006: 1571 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=586725, ups=1.18, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8609 epoch 006: 1571 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=586725, ups=1.18, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8609 epoch 006: 1571 / 1689 loss=3.784, nll_loss=2.258, ppl=4.78, wps=586725, ups=1.18, wpb=496190, bsz=16776.5, num_updates=10000, lr=0.000632456, gnorm=0.189, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=8609 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 3.812 | nll_loss 2.263 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.812 epoch 006 | valid on 'valid' subset | loss 3.812 | nll_loss 2.263 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.812 epoch 006 | valid on 'valid' subset | loss 3.812 | nll_loss 2.263 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.812 epoch 006 | valid on 'valid' subset | loss 3.812 | nll_loss 2.263 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.812 epoch 006 | valid on 'valid' subset | loss 3.812 | nll_loss 2.263 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.812 epoch 006 | valid on 'valid' subset | loss 3.812 | nll_loss 2.263 | ppl 4.8 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.812 epoch 006: 1672 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=513211, ups=1.04, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.185, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8706 epoch 006: 1672 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=513211, ups=1.04, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.185, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8706 epoch 006: 1672 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=513211, ups=1.04, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.185, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8706 epoch 006: 1672 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=513211, ups=1.04, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.185, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8706 epoch 006: 1672 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=513211, ups=1.04, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.185, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8706 epoch 006: 1672 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=513211, ups=1.04, wpb=495672, bsz=16427.1, num_updates=10100, lr=0.000629317, gnorm=0.185, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=8706 end of epoch 6 (average epoch stats below) epoch 006 | loss 3.795 | nll_loss 2.269 | ppl 4.82 | wps 575235 | ups 1.16 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.197 | clip 0 | loss_scale 4 | train_wall 1409 | gb_free 62.5 | wall 8719 epoch 006 | loss 3.795 | nll_loss 2.269 | ppl 4.82 | wps 575235 | ups 1.16 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.197 | clip 0 | loss_scale 4 | train_wall 1409 | gb_free 62.5 | wall 8719 epoch 006 | loss 3.795 | nll_loss 2.269 | ppl 4.82 | wps 575235 | ups 1.16 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.197 | clip 0 | loss_scale 4 | train_wall 1409 | gb_free 62.5 | wall 8719 epoch 006 | loss 3.795 | nll_loss 2.269 | ppl 4.82 | wps 575235 | ups 1.16 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.197 | clip 0 | loss_scale 4 | train_wall 1409 | gb_free 62.5 | wall 8719 epoch 006 | loss 3.795 | nll_loss 2.269 | ppl 4.82 | wps 575235 | ups 1.16 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.197 | clip 0 | loss_scale 4 | train_wall 1409 | gb_free 62.5 | wall 8719 epoch 006 | loss 3.795 | nll_loss 2.269 | ppl 4.82 | wps 575235 | ups 1.16 | wpb 495121 | bsz 16503.4 | num_updates 10117 | lr 0.000628788 | gnorm 0.197 | clip 0 | loss_scale 4 | train_wall 1409 | gb_free 62.5 | wall 8719 Start iterating over samples epoch 007: 83 / 1689 loss=3.759, nll_loss=2.228, ppl=4.68, wps=582652, ups=1.19, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.197, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=8790 epoch 007: 83 / 1689 loss=3.759, nll_loss=2.228, ppl=4.68, wps=582652, ups=1.19, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.197, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=8790 epoch 007: 83 / 1689 loss=3.759, nll_loss=2.228, ppl=4.68, wps=582652, ups=1.19, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.197, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=8790 epoch 007: 83 / 1689 loss=3.759, nll_loss=2.228, ppl=4.68, wps=582652, ups=1.19, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.197, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=8790 epoch 007: 83 / 1689 loss=3.759, nll_loss=2.228, ppl=4.68, wps=582652, ups=1.19, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.197, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=8790 epoch 007: 83 / 1689 loss=3.759, nll_loss=2.228, ppl=4.68, wps=582652, ups=1.19, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.197, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=8790 epoch 007: 83 / 1689 loss=3.759, nll_loss=2.228, ppl=4.68, wps=582652, ups=1.19, wpb=490636, bsz=16314.8, num_updates=10200, lr=0.000626224, gnorm=0.197, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=8790 epoch 007: 183 / 1689 loss=3.745, nll_loss=2.213, ppl=4.64, wps=585531, ups=1.18, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.183, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=8874 epoch 007: 183 / 1689 loss=3.745, nll_loss=2.213, ppl=4.64, wps=585531, ups=1.18, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.183, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=8874 epoch 007: 183 / 1689 loss=3.745, nll_loss=2.213, ppl=4.64, wps=585531, ups=1.18, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.183, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=8874 epoch 007: 183 / 1689 loss=3.745, nll_loss=2.213, ppl=4.64, wps=585531, ups=1.18, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.183, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=8874 epoch 007: 183 / 1689 loss=3.745, nll_loss=2.213, ppl=4.64, wps=585531, ups=1.18, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.183, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=8874 epoch 007: 183 / 1689 loss=3.745, nll_loss=2.213, ppl=4.64, wps=585531, ups=1.18, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.183, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=8874 epoch 007: 183 / 1689 loss=3.745, nll_loss=2.213, ppl=4.64, wps=585531, ups=1.18, wpb=495810, bsz=16695.2, num_updates=10300, lr=0.000623177, gnorm=0.183, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=8874 epoch 007: 283 / 1689 loss=3.754, nll_loss=2.223, ppl=4.67, wps=585131, ups=1.18, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.195, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=8959 epoch 007: 283 / 1689 loss=3.754, nll_loss=2.223, ppl=4.67, wps=585131, ups=1.18, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.195, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=8959 epoch 007: 283 / 1689 loss=3.754, nll_loss=2.223, ppl=4.67, wps=585131, ups=1.18, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.195, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=8959 epoch 007: 283 / 1689 loss=3.754, nll_loss=2.223, ppl=4.67, wps=585131, ups=1.18, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.195, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=8959 epoch 007: 283 / 1689 loss=3.754, nll_loss=2.223, ppl=4.67, wps=585131, ups=1.18, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.195, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=8959 epoch 007: 283 / 1689 loss=3.754, nll_loss=2.223, ppl=4.67, wps=585131, ups=1.18, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.195, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=8959 epoch 007: 283 / 1689 loss=3.754, nll_loss=2.223, ppl=4.67, wps=585131, ups=1.18, wpb=495289, bsz=16615.4, num_updates=10400, lr=0.000620174, gnorm=0.195, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=8959 epoch 007: 383 / 1689 loss=3.758, nll_loss=2.228, ppl=4.69, wps=588153, ups=1.19, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.188, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=9043 epoch 007: 383 / 1689 loss=3.758, nll_loss=2.228, ppl=4.69, wps=588153, ups=1.19, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.188, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=9043 epoch 007: 383 / 1689 loss=3.758, nll_loss=2.228, ppl=4.69, wps=588153, ups=1.19, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.188, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=9043 epoch 007: 383 / 1689 loss=3.758, nll_loss=2.228, ppl=4.69, wps=588153, ups=1.19, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.188, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=9043 epoch 007: 383 / 1689 loss=3.758, nll_loss=2.228, ppl=4.69, wps=588153, ups=1.19, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.188, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=9043 epoch 007: 383 / 1689 loss=3.758, nll_loss=2.228, ppl=4.69, wps=588153, ups=1.19, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.188, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=9043 epoch 007: 383 / 1689 loss=3.758, nll_loss=2.228, ppl=4.69, wps=588153, ups=1.19, wpb=495241, bsz=16108.6, num_updates=10500, lr=0.000617213, gnorm=0.188, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=9043 epoch 007: 483 / 1689 loss=3.762, nll_loss=2.233, ppl=4.7, wps=584957, ups=1.18, wpb=495202, bsz=16557.2, num_updates=10600, lr=0.000614295, gnorm=0.186, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=9128 epoch 007: 483 / 1689 loss=3.762, nll_loss=2.233, ppl=4.7, wps=584957, ups=1.18, wpb=495202, bsz=16557.2, num_updates=10600, lr=0.000614295, gnorm=0.186, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=9128 epoch 007: 483 / 1689 loss=3.762, nll_loss=2.233, ppl=4.7, wps=584957, ups=1.18, wpb=495202, bsz=16557.2, num_updates=10600, lr=0.000614295, gnorm=0.186, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=9128 epoch 007: 483 / 1689 loss=3.762, nll_loss=2.233, ppl=4.7, wps=584957, ups=1.18, wpb=495202, bsz=16557.2, num_updates=10600, lr=0.000614295, gnorm=0.186, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=9128 epoch 007: 483 / 1689 loss=3.762, nll_loss=2.233, ppl=4.7, wps=584957, ups=1.18, wpb=495202, bsz=16557.2, num_updates=10600, lr=0.000614295, gnorm=0.186, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=9128 epoch 007: 483 / 1689 loss=3.762, nll_loss=2.233, ppl=4.7, wps=584957, ups=1.18, wpb=495202, bsz=16557.2, num_updates=10600, lr=0.000614295, gnorm=0.186, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=9128 epoch 007: 483 / 1689 loss=3.762, nll_loss=2.233, ppl=4.7, wps=584957, ups=1.18, wpb=495202, bsz=16557.2, num_updates=10600, lr=0.000614295, gnorm=0.186, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=9128 epoch 007: 583 / 1689 loss=3.756, nll_loss=2.226, ppl=4.68, wps=585028, ups=1.18, wpb=496100, bsz=16461.3, num_updates=10700, lr=0.000611418, gnorm=0.188, clip=0, loss_scale=8, train_wall=84, gb_free=61.7, wall=9213 epoch 007: 583 / 1689 loss=3.756, nll_loss=2.226, ppl=4.68, wps=585028, ups=1.18, wpb=496100, bsz=16461.3, num_updates=10700, lr=0.000611418, gnorm=0.188, clip=0, loss_scale=8, train_wall=84, gb_free=61.7, wall=9213 epoch 007: 583 / 1689 loss=3.756, nll_loss=2.226, ppl=4.68, wps=585028, ups=1.18, wpb=496100, bsz=16461.3, num_updates=10700, lr=0.000611418, gnorm=0.188, clip=0, loss_scale=8, train_wall=84, gb_free=61.7, wall=9213 epoch 007: 583 / 1689 loss=3.756, nll_loss=2.226, ppl=4.68, wps=585028, ups=1.18, wpb=496100, bsz=16461.3, num_updates=10700, lr=0.000611418, gnorm=0.188, clip=0, loss_scale=8, train_wall=84, gb_free=61.7, wall=9213 epoch 007: 583 / 1689 loss=3.756, nll_loss=2.226, ppl=4.68, wps=585028, ups=1.18, wpb=496100, bsz=16461.3, num_updates=10700, lr=0.000611418, gnorm=0.188, clip=0, loss_scale=8, train_wall=84, gb_free=61.7, wall=9213 epoch 007: 583 / 1689 loss=3.756, nll_loss=2.226, ppl=4.68, wps=585028, ups=1.18, wpb=496100, bsz=16461.3, num_updates=10700, lr=0.000611418, gnorm=0.188, clip=0, loss_scale=8, train_wall=84, gb_free=61.7, wall=9213 epoch 007: 583 / 1689 loss=3.756, nll_loss=2.226, ppl=4.68, wps=585028, ups=1.18, wpb=496100, bsz=16461.3, num_updates=10700, lr=0.000611418, gnorm=0.188, clip=0, loss_scale=8, train_wall=84, gb_free=61.7, wall=9213 epoch 007: 684 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=579158, ups=1.17, wpb=494482, bsz=16312.7, num_updates=10800, lr=0.000608581, gnorm=0.184, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=9298 epoch 007: 684 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=579158, ups=1.17, wpb=494482, bsz=16312.7, num_updates=10800, lr=0.000608581, gnorm=0.184, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=9298 epoch 007: 684 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=579158, ups=1.17, wpb=494482, bsz=16312.7, num_updates=10800, lr=0.000608581, gnorm=0.184, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=9298 epoch 007: 684 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=579158, ups=1.17, wpb=494482, bsz=16312.7, num_updates=10800, lr=0.000608581, gnorm=0.184, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=9298 epoch 007: 684 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=579158, ups=1.17, wpb=494482, bsz=16312.7, num_updates=10800, lr=0.000608581, gnorm=0.184, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=9298 epoch 007: 684 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=579158, ups=1.17, wpb=494482, bsz=16312.7, num_updates=10800, lr=0.000608581, gnorm=0.184, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=9298 epoch 007: 684 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=579158, ups=1.17, wpb=494482, bsz=16312.7, num_updates=10800, lr=0.000608581, gnorm=0.184, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=9298 epoch 007: 786 / 1689 loss=3.759, nll_loss=2.229, ppl=4.69, wps=572114, ups=1.16, wpb=494161, bsz=16273.6, num_updates=10900, lr=0.000605783, gnorm=0.192, clip=0, loss_scale=1, train_wall=85, gb_free=61.5, wall=9385 epoch 007: 786 / 1689 loss=3.759, nll_loss=2.229, ppl=4.69, wps=572114, ups=1.16, wpb=494161, bsz=16273.6, num_updates=10900, lr=0.000605783, gnorm=0.192, clip=0, loss_scale=1, train_wall=85, gb_free=61.5, wall=9385 epoch 007: 786 / 1689 loss=3.759, nll_loss=2.229, ppl=4.69, wps=572114, ups=1.16, wpb=494161, bsz=16273.6, num_updates=10900, lr=0.000605783, gnorm=0.192, clip=0, loss_scale=1, train_wall=85, gb_free=61.5, wall=9385 epoch 007: 786 / 1689 loss=3.759, nll_loss=2.229, ppl=4.69, wps=572114, ups=1.16, wpb=494161, bsz=16273.6, num_updates=10900, lr=0.000605783, gnorm=0.192, clip=0, loss_scale=1, train_wall=85, gb_free=61.5, wall=9385 epoch 007: 786 / 1689 loss=3.759, nll_loss=2.229, ppl=4.69, wps=572114, ups=1.16, wpb=494161, bsz=16273.6, num_updates=10900, lr=0.000605783, gnorm=0.192, clip=0, loss_scale=1, train_wall=85, gb_free=61.5, wall=9385 epoch 007: 786 / 1689 loss=3.759, nll_loss=2.229, ppl=4.69, wps=572114, ups=1.16, wpb=494161, bsz=16273.6, num_updates=10900, lr=0.000605783, gnorm=0.192, clip=0, loss_scale=1, train_wall=85, gb_free=61.5, wall=9385 epoch 007: 786 / 1689 loss=3.759, nll_loss=2.229, ppl=4.69, wps=572114, ups=1.16, wpb=494161, bsz=16273.6, num_updates=10900, lr=0.000605783, gnorm=0.192, clip=0, loss_scale=1, train_wall=85, gb_free=61.5, wall=9385 epoch 007: 886 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=583242, ups=1.18, wpb=494771, bsz=16298.6, num_updates=11000, lr=0.000603023, gnorm=0.201, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=9469 epoch 007: 886 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=583242, ups=1.18, wpb=494771, bsz=16298.6, num_updates=11000, lr=0.000603023, gnorm=0.201, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=9469 epoch 007: 886 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=583242, ups=1.18, wpb=494771, bsz=16298.6, num_updates=11000, lr=0.000603023, gnorm=0.201, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=9469 epoch 007: 886 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=583242, ups=1.18, wpb=494771, bsz=16298.6, num_updates=11000, lr=0.000603023, gnorm=0.201, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=9469 epoch 007: 886 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=583242, ups=1.18, wpb=494771, bsz=16298.6, num_updates=11000, lr=0.000603023, gnorm=0.201, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=9469 epoch 007: 886 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=583242, ups=1.18, wpb=494771, bsz=16298.6, num_updates=11000, lr=0.000603023, gnorm=0.201, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=9469 epoch 007: 886 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=583242, ups=1.18, wpb=494771, bsz=16298.6, num_updates=11000, lr=0.000603023, gnorm=0.201, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=9469 begin validation on "valid" subset epoch 007 | valid on 'valid' subset | loss 3.795 | nll_loss 2.246 | ppl 4.74 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.795 epoch 007 | valid on 'valid' subset | loss 3.795 | nll_loss 2.246 | ppl 4.74 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.795 epoch 007 | valid on 'valid' subset | loss 3.795 | nll_loss 2.246 | ppl 4.74 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.795 epoch 007 | valid on 'valid' subset | loss 3.795 | nll_loss 2.246 | ppl 4.74 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.795 epoch 007 | valid on 'valid' subset | loss 3.795 | nll_loss 2.246 | ppl 4.74 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.795 epoch 007 | valid on 'valid' subset | loss 3.795 | nll_loss 2.246 | ppl 4.74 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.795 epoch 007 | valid on 'valid' subset | loss 3.795 | nll_loss 2.246 | ppl 4.74 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.795 epoch 007: 987 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=514368, ups=1.04, wpb=496420, bsz=16353.1, num_updates=11100, lr=0.0006003, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=9566 epoch 007: 987 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=514368, ups=1.04, wpb=496420, bsz=16353.1, num_updates=11100, lr=0.0006003, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=9566 epoch 007: 987 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=514368, ups=1.04, wpb=496420, bsz=16353.1, num_updates=11100, lr=0.0006003, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=9566 epoch 007: 987 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=514368, ups=1.04, wpb=496420, bsz=16353.1, num_updates=11100, lr=0.0006003, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=9566 epoch 007: 987 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=514368, ups=1.04, wpb=496420, bsz=16353.1, num_updates=11100, lr=0.0006003, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=9566 epoch 007: 987 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=514368, ups=1.04, wpb=496420, bsz=16353.1, num_updates=11100, lr=0.0006003, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=9566 epoch 007: 987 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=514368, ups=1.04, wpb=496420, bsz=16353.1, num_updates=11100, lr=0.0006003, gnorm=0.183, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=9566 epoch 007: 1087 / 1689 loss=3.755, nll_loss=2.225, ppl=4.68, wps=584713, ups=1.18, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=9651 epoch 007: 1087 / 1689 loss=3.755, nll_loss=2.225, ppl=4.68, wps=584713, ups=1.18, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=9651 epoch 007: 1087 / 1689 loss=3.755, nll_loss=2.225, ppl=4.68, wps=584713, ups=1.18, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=9651 epoch 007: 1087 / 1689 loss=3.755, nll_loss=2.225, ppl=4.68, wps=584713, ups=1.18, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=9651 epoch 007: 1087 / 1689 loss=3.755, nll_loss=2.225, ppl=4.68, wps=584713, ups=1.18, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=9651 epoch 007: 1087 / 1689 loss=3.755, nll_loss=2.225, ppl=4.68, wps=584713, ups=1.18, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=9651 epoch 007: 1087 / 1689 loss=3.755, nll_loss=2.225, ppl=4.68, wps=584713, ups=1.18, wpb=496429, bsz=16920.5, num_updates=11200, lr=0.000597614, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=9651 epoch 007: 1187 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=587342, ups=1.18, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=83, gb_free=61, wall=9735 epoch 007: 1187 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=587342, ups=1.18, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=83, gb_free=61, wall=9735 epoch 007: 1187 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=587342, ups=1.18, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=83, gb_free=61, wall=9735 epoch 007: 1187 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=587342, ups=1.18, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=83, gb_free=61, wall=9735 epoch 007: 1187 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=587342, ups=1.18, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=83, gb_free=61, wall=9735 epoch 007: 1187 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=587342, ups=1.18, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=83, gb_free=61, wall=9735 epoch 007: 1187 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=587342, ups=1.18, wpb=495879, bsz=16719.6, num_updates=11300, lr=0.000594964, gnorm=0.185, clip=0, loss_scale=0.5, train_wall=83, gb_free=61, wall=9735 epoch 007: 1287 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=584440, ups=1.18, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.4, wall=9820 epoch 007: 1287 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=584440, ups=1.18, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.4, wall=9820 epoch 007: 1287 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=584440, ups=1.18, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.4, wall=9820 epoch 007: 1287 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=584440, ups=1.18, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.4, wall=9820 epoch 007: 1287 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=584440, ups=1.18, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.4, wall=9820 epoch 007: 1287 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=584440, ups=1.18, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.4, wall=9820 epoch 007: 1287 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=584440, ups=1.18, wpb=495370, bsz=16435, num_updates=11400, lr=0.000592349, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.4, wall=9820 epoch 007: 1387 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=586741, ups=1.19, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.2, wall=9904 epoch 007: 1387 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=586741, ups=1.19, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.2, wall=9904 epoch 007: 1387 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=586741, ups=1.19, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.2, wall=9904 epoch 007: 1387 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=586741, ups=1.19, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.2, wall=9904 epoch 007: 1387 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=586741, ups=1.19, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.2, wall=9904 epoch 007: 1387 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=586741, ups=1.19, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.2, wall=9904 epoch 007: 1387 / 1689 loss=3.761, nll_loss=2.232, ppl=4.7, wps=586741, ups=1.19, wpb=494609, bsz=16436.4, num_updates=11500, lr=0.000589768, gnorm=0.184, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.2, wall=9904 epoch 007: 1487 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=586047, ups=1.18, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.174, clip=0, loss_scale=1, train_wall=84, gb_free=61.2, wall=9989 epoch 007: 1487 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=586047, ups=1.18, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.174, clip=0, loss_scale=1, train_wall=84, gb_free=61.2, wall=9989 epoch 007: 1487 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=586047, ups=1.18, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.174, clip=0, loss_scale=1, train_wall=84, gb_free=61.2, wall=9989 epoch 007: 1487 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=586047, ups=1.18, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.174, clip=0, loss_scale=1, train_wall=84, gb_free=61.2, wall=9989 epoch 007: 1487 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=586047, ups=1.18, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.174, clip=0, loss_scale=1, train_wall=84, gb_free=61.2, wall=9989 epoch 007: 1487 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=586047, ups=1.18, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.174, clip=0, loss_scale=1, train_wall=84, gb_free=61.2, wall=9989 epoch 007: 1487 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=586047, ups=1.18, wpb=496297, bsz=16689.1, num_updates=11600, lr=0.00058722, gnorm=0.174, clip=0, loss_scale=1, train_wall=84, gb_free=61.2, wall=9989 epoch 007: 1587 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=584487, ups=1.18, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.171, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=10074 epoch 007: 1587 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=584487, ups=1.18, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.171, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=10074 epoch 007: 1587 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=584487, ups=1.18, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.171, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=10074 epoch 007: 1587 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=584487, ups=1.18, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.171, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=10074 epoch 007: 1587 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=584487, ups=1.18, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.171, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=10074 epoch 007: 1587 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=584487, ups=1.18, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.171, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=10074 epoch 007: 1587 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=584487, ups=1.18, wpb=495970, bsz=16964.3, num_updates=11700, lr=0.000584705, gnorm=0.171, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=10074 epoch 007: 1687 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=582422, ups=1.18, wpb=494597, bsz=16513.8, num_updates=11800, lr=0.000582223, gnorm=0.177, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=10159 epoch 007: 1687 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=582422, ups=1.18, wpb=494597, bsz=16513.8, num_updates=11800, lr=0.000582223, gnorm=0.177, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=10159 epoch 007: 1687 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=582422, ups=1.18, wpb=494597, bsz=16513.8, num_updates=11800, lr=0.000582223, gnorm=0.177, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=10159 epoch 007: 1687 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=582422, ups=1.18, wpb=494597, bsz=16513.8, num_updates=11800, lr=0.000582223, gnorm=0.177, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=10159 epoch 007: 1687 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=582422, ups=1.18, wpb=494597, bsz=16513.8, num_updates=11800, lr=0.000582223, gnorm=0.177, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=10159 epoch 007: 1687 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=582422, ups=1.18, wpb=494597, bsz=16513.8, num_updates=11800, lr=0.000582223, gnorm=0.177, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=10159 epoch 007: 1687 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=582422, ups=1.18, wpb=494597, bsz=16513.8, num_updates=11800, lr=0.000582223, gnorm=0.177, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=10159 end of epoch 7 (average epoch stats below) epoch 007 | loss 3.755 | nll_loss 2.225 | ppl 4.68 | wps 579100 | ups 1.17 | wpb 495094 | bsz 16509.3 | num_updates 11802 | lr 0.000582173 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1410 | gb_free 62.7 | wall 10160 epoch 007 | loss 3.755 | nll_loss 2.225 | ppl 4.68 | wps 579100 | ups 1.17 | wpb 495094 | bsz 16509.3 | num_updates 11802 | lr 0.000582173 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1410 | gb_free 62.7 | wall 10160 epoch 007 | loss 3.755 | nll_loss 2.225 | ppl 4.68 | wps 579100 | ups 1.17 | wpb 495094 | bsz 16509.3 | num_updates 11802 | lr 0.000582173 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1410 | gb_free 62.7 | wall 10160 epoch 007 | loss 3.755 | nll_loss 2.225 | ppl 4.68 | wps 579100 | ups 1.17 | wpb 495094 | bsz 16509.3 | num_updates 11802 | lr 0.000582173 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1410 | gb_free 62.7 | wall 10160 epoch 007 | loss 3.755 | nll_loss 2.225 | ppl 4.68 | wps 579100 | ups 1.17 | wpb 495094 | bsz 16509.3 | num_updates 11802 | lr 0.000582173 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1410 | gb_free 62.7 | wall 10160 epoch 007 | loss 3.755 | nll_loss 2.225 | ppl 4.68 | wps 579100 | ups 1.17 | wpb 495094 | bsz 16509.3 | num_updates 11802 | lr 0.000582173 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1410 | gb_free 62.7 | wall 10160 epoch 007 | loss 3.755 | nll_loss 2.225 | ppl 4.68 | wps 579100 | ups 1.17 | wpb 495094 | bsz 16509.3 | num_updates 11802 | lr 0.000582173 | gnorm 0.186 | clip 0 | loss_scale 1 | train_wall 1410 | gb_free 62.7 | wall 10160 Start iterating over samples epoch 008: 98 / 1689 loss=3.711, nll_loss=2.176, ppl=4.52, wps=581041, ups=1.18, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.187, clip=0, loss_scale=1, train_wall=83, gb_free=61.1, wall=10243 epoch 008: 98 / 1689 loss=3.711, nll_loss=2.176, ppl=4.52, wps=581041, ups=1.18, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.187, clip=0, loss_scale=1, train_wall=83, gb_free=61.1, wall=10243 epoch 008: 98 / 1689 loss=3.711, nll_loss=2.176, ppl=4.52, wps=581041, ups=1.18, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.187, clip=0, loss_scale=1, train_wall=83, gb_free=61.1, wall=10243 epoch 008: 98 / 1689 loss=3.711, nll_loss=2.176, ppl=4.52, wps=581041, ups=1.18, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.187, clip=0, loss_scale=1, train_wall=83, gb_free=61.1, wall=10243 epoch 008: 98 / 1689 loss=3.711, nll_loss=2.176, ppl=4.52, wps=581041, ups=1.18, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.187, clip=0, loss_scale=1, train_wall=83, gb_free=61.1, wall=10243 epoch 008: 98 / 1689 loss=3.711, nll_loss=2.176, ppl=4.52, wps=581041, ups=1.18, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.187, clip=0, loss_scale=1, train_wall=83, gb_free=61.1, wall=10243 epoch 008: 98 / 1689 loss=3.711, nll_loss=2.176, ppl=4.52, wps=581041, ups=1.18, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.187, clip=0, loss_scale=1, train_wall=83, gb_free=61.1, wall=10243 epoch 008: 98 / 1689 loss=3.711, nll_loss=2.176, ppl=4.52, wps=581041, ups=1.18, wpb=491598, bsz=16544, num_updates=11900, lr=0.000579771, gnorm=0.187, clip=0, loss_scale=1, train_wall=83, gb_free=61.1, wall=10243 epoch 008: 198 / 1689 loss=3.715, nll_loss=2.18, ppl=4.53, wps=583961, ups=1.18, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.185, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=10328 epoch 008: 198 / 1689 loss=3.715, nll_loss=2.18, ppl=4.53, wps=583961, ups=1.18, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.185, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=10328 epoch 008: 198 / 1689 loss=3.715, nll_loss=2.18, ppl=4.53, wps=583961, ups=1.18, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.185, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=10328 epoch 008: 198 / 1689 loss=3.715, nll_loss=2.18, ppl=4.53, wps=583961, ups=1.18, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.185, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=10328 epoch 008: 198 / 1689 loss=3.715, nll_loss=2.18, ppl=4.53, wps=583961, ups=1.18, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.185, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=10328 epoch 008: 198 / 1689 loss=3.715, nll_loss=2.18, ppl=4.53, wps=583961, ups=1.18, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.185, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=10328 epoch 008: 198 / 1689 loss=3.715, nll_loss=2.18, ppl=4.53, wps=583961, ups=1.18, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.185, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=10328 epoch 008: 198 / 1689 loss=3.715, nll_loss=2.18, ppl=4.53, wps=583961, ups=1.18, wpb=494386, bsz=16689, num_updates=12000, lr=0.00057735, gnorm=0.185, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=10328 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 3.804 | nll_loss 2.259 | ppl 4.79 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.795 epoch 008 | valid on 'valid' subset | loss 3.804 | nll_loss 2.259 | ppl 4.79 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.795 epoch 008 | valid on 'valid' subset | loss 3.804 | nll_loss 2.259 | ppl 4.79 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.795 epoch 008 | valid on 'valid' subset | loss 3.804 | nll_loss 2.259 | ppl 4.79 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.795 epoch 008 | valid on 'valid' subset | loss 3.804 | nll_loss 2.259 | ppl 4.79 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.795 epoch 008 | valid on 'valid' subset | loss 3.804 | nll_loss 2.259 | ppl 4.79 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.795 epoch 008 | valid on 'valid' subset | loss 3.804 | nll_loss 2.259 | ppl 4.79 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.795 epoch 008 | valid on 'valid' subset | loss 3.804 | nll_loss 2.259 | ppl 4.79 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.795 epoch 008: 298 / 1689 loss=3.722, nll_loss=2.188, ppl=4.56, wps=538834, ups=1.09, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.179, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=10420 epoch 008: 298 / 1689 loss=3.722, nll_loss=2.188, ppl=4.56, wps=538834, ups=1.09, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.179, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=10420 epoch 008: 298 / 1689 loss=3.722, nll_loss=2.188, ppl=4.56, wps=538834, ups=1.09, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.179, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=10420 epoch 008: 298 / 1689 loss=3.722, nll_loss=2.188, ppl=4.56, wps=538834, ups=1.09, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.179, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=10420 epoch 008: 298 / 1689 loss=3.722, nll_loss=2.188, ppl=4.56, wps=538834, ups=1.09, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.179, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=10420 epoch 008: 298 / 1689 loss=3.722, nll_loss=2.188, ppl=4.56, wps=538834, ups=1.09, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.179, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=10420 epoch 008: 298 / 1689 loss=3.722, nll_loss=2.188, ppl=4.56, wps=538834, ups=1.09, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.179, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=10420 epoch 008: 298 / 1689 loss=3.722, nll_loss=2.188, ppl=4.56, wps=538834, ups=1.09, wpb=495533, bsz=16549.9, num_updates=12100, lr=0.00057496, gnorm=0.179, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=10420 epoch 008: 398 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=584879, ups=1.18, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.174, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=10505 epoch 008: 398 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=584879, ups=1.18, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.174, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=10505 epoch 008: 398 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=584879, ups=1.18, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.174, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=10505 epoch 008: 398 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=584879, ups=1.18, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.174, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=10505 epoch 008: 398 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=584879, ups=1.18, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.174, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=10505 epoch 008: 398 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=584879, ups=1.18, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.174, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=10505 epoch 008: 398 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=584879, ups=1.18, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.174, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=10505 epoch 008: 398 / 1689 loss=3.716, nll_loss=2.182, ppl=4.54, wps=584879, ups=1.18, wpb=495068, bsz=16662.2, num_updates=12200, lr=0.000572598, gnorm=0.174, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=10505 epoch 008: 498 / 1689 loss=3.729, nll_loss=2.196, ppl=4.58, wps=586422, ups=1.19, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.175, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=10589 epoch 008: 498 / 1689 loss=3.729, nll_loss=2.196, ppl=4.58, wps=586422, ups=1.19, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.175, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=10589 epoch 008: 498 / 1689 loss=3.729, nll_loss=2.196, ppl=4.58, wps=586422, ups=1.19, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.175, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=10589 epoch 008: 498 / 1689 loss=3.729, nll_loss=2.196, ppl=4.58, wps=586422, ups=1.19, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.175, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=10589 epoch 008: 498 / 1689 loss=3.729, nll_loss=2.196, ppl=4.58, wps=586422, ups=1.19, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.175, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=10589 epoch 008: 498 / 1689 loss=3.729, nll_loss=2.196, ppl=4.58, wps=586422, ups=1.19, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.175, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=10589 epoch 008: 498 / 1689 loss=3.729, nll_loss=2.196, ppl=4.58, wps=586422, ups=1.19, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.175, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=10589 epoch 008: 498 / 1689 loss=3.729, nll_loss=2.196, ppl=4.58, wps=586422, ups=1.19, wpb=493366, bsz=15939, num_updates=12300, lr=0.000570266, gnorm=0.175, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=10589 epoch 008: 598 / 1689 loss=3.732, nll_loss=2.2, ppl=4.6, wps=586463, ups=1.18, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.17, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=10673 epoch 008: 598 / 1689 loss=3.732, nll_loss=2.2, ppl=4.6, wps=586463, ups=1.18, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.17, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=10673 epoch 008: 598 / 1689 loss=3.732, nll_loss=2.2, ppl=4.6, wps=586463, ups=1.18, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.17, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=10673 epoch 008: 598 / 1689 loss=3.732, nll_loss=2.2, ppl=4.6, wps=586463, ups=1.18, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.17, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=10673 epoch 008: 598 / 1689 loss=3.732, nll_loss=2.2, ppl=4.6, wps=586463, ups=1.18, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.17, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=10673 epoch 008: 598 / 1689 loss=3.732, nll_loss=2.2, ppl=4.6, wps=586463, ups=1.18, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.17, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=10673 epoch 008: 598 / 1689 loss=3.732, nll_loss=2.2, ppl=4.6, wps=586463, ups=1.18, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.17, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=10673 epoch 008: 598 / 1689 loss=3.732, nll_loss=2.2, ppl=4.6, wps=586463, ups=1.18, wpb=495106, bsz=16509, num_updates=12400, lr=0.000567962, gnorm=0.17, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=10673 epoch 008: 698 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=585457, ups=1.18, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.186, clip=0, loss_scale=2, train_wall=84, gb_free=59.6, wall=10758 epoch 008: 698 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=585457, ups=1.18, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.186, clip=0, loss_scale=2, train_wall=84, gb_free=59.6, wall=10758 epoch 008: 698 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=585457, ups=1.18, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.186, clip=0, loss_scale=2, train_wall=84, gb_free=59.6, wall=10758 epoch 008: 698 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=585457, ups=1.18, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.186, clip=0, loss_scale=2, train_wall=84, gb_free=59.6, wall=10758 epoch 008: 698 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=585457, ups=1.18, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.186, clip=0, loss_scale=2, train_wall=84, gb_free=59.6, wall=10758 epoch 008: 698 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=585457, ups=1.18, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.186, clip=0, loss_scale=2, train_wall=84, gb_free=59.6, wall=10758 epoch 008: 698 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=585457, ups=1.18, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.186, clip=0, loss_scale=2, train_wall=84, gb_free=59.6, wall=10758 epoch 008: 698 / 1689 loss=3.724, nll_loss=2.191, ppl=4.57, wps=585457, ups=1.18, wpb=495682, bsz=16531.5, num_updates=12500, lr=0.000565685, gnorm=0.186, clip=0, loss_scale=2, train_wall=84, gb_free=59.6, wall=10758 epoch 008: 798 / 1689 loss=3.728, nll_loss=2.195, ppl=4.58, wps=587312, ups=1.18, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.169, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=10842 epoch 008: 798 / 1689 loss=3.728, nll_loss=2.195, ppl=4.58, wps=587312, ups=1.18, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.169, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=10842 epoch 008: 798 / 1689 loss=3.728, nll_loss=2.195, ppl=4.58, wps=587312, ups=1.18, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.169, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=10842 epoch 008: 798 / 1689 loss=3.728, nll_loss=2.195, ppl=4.58, wps=587312, ups=1.18, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.169, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=10842 epoch 008: 798 / 1689 loss=3.728, nll_loss=2.195, ppl=4.58, wps=587312, ups=1.18, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.169, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=10842 epoch 008: 798 / 1689 loss=3.728, nll_loss=2.195, ppl=4.58, wps=587312, ups=1.18, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.169, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=10842 epoch 008: 798 / 1689 loss=3.728, nll_loss=2.195, ppl=4.58, wps=587312, ups=1.18, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.169, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=10842 epoch 008: 798 / 1689 loss=3.728, nll_loss=2.195, ppl=4.58, wps=587312, ups=1.18, wpb=495975, bsz=16442.3, num_updates=12600, lr=0.000563436, gnorm=0.169, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=10842 epoch 008: 898 / 1689 loss=3.729, nll_loss=2.197, ppl=4.58, wps=589637, ups=1.19, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.176, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=10926 epoch 008: 898 / 1689 loss=3.729, nll_loss=2.197, ppl=4.58, wps=589637, ups=1.19, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.176, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=10926 epoch 008: 898 / 1689 loss=3.729, nll_loss=2.197, ppl=4.58, wps=589637, ups=1.19, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.176, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=10926 epoch 008: 898 / 1689 loss=3.729, nll_loss=2.197, ppl=4.58, wps=589637, ups=1.19, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.176, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=10926 epoch 008: 898 / 1689 loss=3.729, nll_loss=2.197, ppl=4.58, wps=589637, ups=1.19, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.176, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=10926 epoch 008: 898 / 1689 loss=3.729, nll_loss=2.197, ppl=4.58, wps=589637, ups=1.19, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.176, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=10926 epoch 008: 898 / 1689 loss=3.729, nll_loss=2.197, ppl=4.58, wps=589637, ups=1.19, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.176, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=10926 epoch 008: 898 / 1689 loss=3.729, nll_loss=2.197, ppl=4.58, wps=589637, ups=1.19, wpb=496040, bsz=16665, num_updates=12700, lr=0.000561214, gnorm=0.176, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=10926 epoch 008: 998 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=584267, ups=1.18, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.175, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=11011 epoch 008: 998 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=584267, ups=1.18, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.175, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=11011 epoch 008: 998 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=584267, ups=1.18, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.175, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=11011 epoch 008: 998 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=584267, ups=1.18, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.175, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=11011 epoch 008: 998 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=584267, ups=1.18, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.175, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=11011 epoch 008: 998 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=584267, ups=1.18, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.175, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=11011 epoch 008: 998 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=584267, ups=1.18, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.175, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=11011 epoch 008: 998 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=584267, ups=1.18, wpb=494198, bsz=16455.6, num_updates=12800, lr=0.000559017, gnorm=0.175, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=11011 epoch 008: 1098 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=586144, ups=1.18, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.168, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=11095 epoch 008: 1098 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=586144, ups=1.18, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.168, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=11095 epoch 008: 1098 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=586144, ups=1.18, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.168, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=11095 epoch 008: 1098 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=586144, ups=1.18, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.168, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=11095 epoch 008: 1098 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=586144, ups=1.18, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.168, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=11095 epoch 008: 1098 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=586144, ups=1.18, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.168, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=11095 epoch 008: 1098 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=586144, ups=1.18, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.168, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=11095 epoch 008: 1098 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=586144, ups=1.18, wpb=494969, bsz=16398.1, num_updates=12900, lr=0.000556846, gnorm=0.168, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=11095 epoch 008: 1198 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=588835, ups=1.18, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.176, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=11180 epoch 008: 1198 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=588835, ups=1.18, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.176, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=11180 epoch 008: 1198 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=588835, ups=1.18, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.176, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=11180 epoch 008: 1198 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=588835, ups=1.18, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.176, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=11180 epoch 008: 1198 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=588835, ups=1.18, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.176, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=11180 epoch 008: 1198 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=588835, ups=1.18, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.176, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=11180 epoch 008: 1198 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=588835, ups=1.18, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.176, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=11180 epoch 008: 1198 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=588835, ups=1.18, wpb=497424, bsz=16192.6, num_updates=13000, lr=0.0005547, gnorm=0.176, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=11180 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 3.776 | nll_loss 2.227 | ppl 4.68 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.776 epoch 008 | valid on 'valid' subset | loss 3.776 | nll_loss 2.227 | ppl 4.68 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.776 epoch 008 | valid on 'valid' subset | loss 3.776 | nll_loss 2.227 | ppl 4.68 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.776 epoch 008 | valid on 'valid' subset | loss 3.776 | nll_loss 2.227 | ppl 4.68 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.776 epoch 008 | valid on 'valid' subset | loss 3.776 | nll_loss 2.227 | ppl 4.68 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.776 epoch 008 | valid on 'valid' subset | loss 3.776 | nll_loss 2.227 | ppl 4.68 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.776 epoch 008 | valid on 'valid' subset | loss 3.776 | nll_loss 2.227 | ppl 4.68 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.776 epoch 008 | valid on 'valid' subset | loss 3.776 | nll_loss 2.227 | ppl 4.68 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.776 epoch 008: 1298 / 1689 loss=3.72, nll_loss=2.187, ppl=4.55, wps=509056, ups=1.03, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=11277 epoch 008: 1298 / 1689 loss=3.72, nll_loss=2.187, ppl=4.55, wps=509056, ups=1.03, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=11277 epoch 008: 1298 / 1689 loss=3.72, nll_loss=2.187, ppl=4.55, wps=509056, ups=1.03, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=11277 epoch 008: 1298 / 1689 loss=3.72, nll_loss=2.187, ppl=4.55, wps=509056, ups=1.03, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=11277 epoch 008: 1298 / 1689 loss=3.72, nll_loss=2.187, ppl=4.55, wps=509056, ups=1.03, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=11277 epoch 008: 1298 / 1689 loss=3.72, nll_loss=2.187, ppl=4.55, wps=509056, ups=1.03, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=11277 epoch 008: 1298 / 1689 loss=3.72, nll_loss=2.187, ppl=4.55, wps=509056, ups=1.03, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=11277 epoch 008: 1298 / 1689 loss=3.72, nll_loss=2.187, ppl=4.55, wps=509056, ups=1.03, wpb=495453, bsz=16751, num_updates=13100, lr=0.000552579, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=11277 epoch 008: 1399 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=582493, ups=1.17, wpb=496586, bsz=16595.3, num_updates=13200, lr=0.000550482, gnorm=0.178, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=11362 epoch 008: 1399 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=582493, ups=1.17, wpb=496586, bsz=16595.3, num_updates=13200, lr=0.000550482, gnorm=0.178, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=11362 epoch 008: 1399 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=582493, ups=1.17, wpb=496586, bsz=16595.3, num_updates=13200, lr=0.000550482, gnorm=0.178, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=11362 epoch 008: 1399 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=582493, ups=1.17, wpb=496586, bsz=16595.3, num_updates=13200, lr=0.000550482, gnorm=0.178, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=11362 epoch 008: 1399 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=582493, ups=1.17, wpb=496586, bsz=16595.3, num_updates=13200, lr=0.000550482, gnorm=0.178, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=11362 epoch 008: 1399 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=582493, ups=1.17, wpb=496586, bsz=16595.3, num_updates=13200, lr=0.000550482, gnorm=0.178, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=11362 epoch 008: 1399 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=582493, ups=1.17, wpb=496586, bsz=16595.3, num_updates=13200, lr=0.000550482, gnorm=0.178, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=11362 epoch 008: 1399 / 1689 loss=3.723, nll_loss=2.191, ppl=4.57, wps=582493, ups=1.17, wpb=496586, bsz=16595.3, num_updates=13200, lr=0.000550482, gnorm=0.178, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=11362 epoch 008: 1499 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=586588, ups=1.18, wpb=495143, bsz=16468.4, num_updates=13300, lr=0.000548408, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11447 epoch 008: 1499 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=586588, ups=1.18, wpb=495143, bsz=16468.4, num_updates=13300, lr=0.000548408, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11447 epoch 008: 1499 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=586588, ups=1.18, wpb=495143, bsz=16468.4, num_updates=13300, lr=0.000548408, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11447 epoch 008: 1499 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=586588, ups=1.18, wpb=495143, bsz=16468.4, num_updates=13300, lr=0.000548408, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11447 epoch 008: 1499 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=586588, ups=1.18, wpb=495143, bsz=16468.4, num_updates=13300, lr=0.000548408, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11447 epoch 008: 1499 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=586588, ups=1.18, wpb=495143, bsz=16468.4, num_updates=13300, lr=0.000548408, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11447 epoch 008: 1499 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=586588, ups=1.18, wpb=495143, bsz=16468.4, num_updates=13300, lr=0.000548408, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11447 epoch 008: 1499 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=586588, ups=1.18, wpb=495143, bsz=16468.4, num_updates=13300, lr=0.000548408, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11447 epoch 008: 1599 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=586799, ups=1.19, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.178, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11531 epoch 008: 1599 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=586799, ups=1.19, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.178, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11531 epoch 008: 1599 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=586799, ups=1.19, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.178, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11531 epoch 008: 1599 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=586799, ups=1.19, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.178, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11531 epoch 008: 1599 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=586799, ups=1.19, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.178, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11531 epoch 008: 1599 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=586799, ups=1.19, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.178, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11531 epoch 008: 1599 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=586799, ups=1.19, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.178, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11531 epoch 008: 1599 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=586799, ups=1.19, wpb=494698, bsz=16570.9, num_updates=13400, lr=0.000546358, gnorm=0.178, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=11531 end of epoch 8 (average epoch stats below) epoch 008 | loss 3.724 | nll_loss 2.191 | ppl 4.57 | wps 577500 | ups 1.17 | wpb 495124 | bsz 16508.6 | num_updates 13490 | lr 0.000544533 | gnorm 0.175 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.4 | wall 11607 epoch 008 | loss 3.724 | nll_loss 2.191 | ppl 4.57 | wps 577500 | ups 1.17 | wpb 495124 | bsz 16508.6 | num_updates 13490 | lr 0.000544533 | gnorm 0.175 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.4 | wall 11607 epoch 008 | loss 3.724 | nll_loss 2.191 | ppl 4.57 | wps 577500 | ups 1.17 | wpb 495124 | bsz 16508.6 | num_updates 13490 | lr 0.000544533 | gnorm 0.175 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.4 | wall 11607 epoch 008 | loss 3.724 | nll_loss 2.191 | ppl 4.57 | wps 577500 | ups 1.17 | wpb 495124 | bsz 16508.6 | num_updates 13490 | lr 0.000544533 | gnorm 0.175 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.4 | wall 11607 epoch 008 | loss 3.724 | nll_loss 2.191 | ppl 4.57 | wps 577500 | ups 1.17 | wpb 495124 | bsz 16508.6 | num_updates 13490 | lr 0.000544533 | gnorm 0.175 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.4 | wall 11607 epoch 008 | loss 3.724 | nll_loss 2.191 | ppl 4.57 | wps 577500 | ups 1.17 | wpb 495124 | bsz 16508.6 | num_updates 13490 | lr 0.000544533 | gnorm 0.175 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.4 | wall 11607 epoch 008 | loss 3.724 | nll_loss 2.191 | ppl 4.57 | wps 577500 | ups 1.17 | wpb 495124 | bsz 16508.6 | num_updates 13490 | lr 0.000544533 | gnorm 0.175 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.4 | wall 11607 epoch 008 | loss 3.724 | nll_loss 2.191 | ppl 4.57 | wps 577500 | ups 1.17 | wpb 495124 | bsz 16508.6 | num_updates 13490 | lr 0.000544533 | gnorm 0.175 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.4 | wall 11607 Start iterating over samples epoch 009: 10 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=581669, ups=1.18, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.177, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=11616 epoch 009: 10 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=581669, ups=1.18, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.177, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=11616 epoch 009: 10 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=581669, ups=1.18, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.177, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=11616 epoch 009: 10 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=581669, ups=1.18, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.177, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=11616 epoch 009: 10 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=581669, ups=1.18, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.177, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=11616 epoch 009: 10 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=581669, ups=1.18, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.177, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=11616 epoch 009: 10 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=581669, ups=1.18, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.177, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=11616 epoch 009: 10 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=581669, ups=1.18, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.177, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=11616 epoch 009: 10 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=581669, ups=1.18, wpb=491756, bsz=16523.8, num_updates=13500, lr=0.000544331, gnorm=0.177, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=11616 epoch 009: 110 / 1689 loss=3.687, nll_loss=2.15, ppl=4.44, wps=587682, ups=1.19, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=11700 epoch 009: 110 / 1689 loss=3.687, nll_loss=2.15, ppl=4.44, wps=587682, ups=1.19, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=11700 epoch 009: 110 / 1689 loss=3.687, nll_loss=2.15, ppl=4.44, wps=587682, ups=1.19, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=11700 epoch 009: 110 / 1689 loss=3.687, nll_loss=2.15, ppl=4.44, wps=587682, ups=1.19, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=11700 epoch 009: 110 / 1689 loss=3.687, nll_loss=2.15, ppl=4.44, wps=587682, ups=1.19, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=11700 epoch 009: 110 / 1689 loss=3.687, nll_loss=2.15, ppl=4.44, wps=587682, ups=1.19, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=11700 epoch 009: 110 / 1689 loss=3.687, nll_loss=2.15, ppl=4.44, wps=587682, ups=1.19, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=11700 epoch 009: 110 / 1689 loss=3.687, nll_loss=2.15, ppl=4.44, wps=587682, ups=1.19, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=11700 epoch 009: 110 / 1689 loss=3.687, nll_loss=2.15, ppl=4.44, wps=587682, ups=1.19, wpb=495785, bsz=16394.6, num_updates=13600, lr=0.000542326, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=11700 epoch 009: 211 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=579588, ups=1.17, wpb=493562, bsz=16632.6, num_updates=13700, lr=0.000540343, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=11785 epoch 009: 211 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=579588, ups=1.17, wpb=493562, bsz=16632.6, num_updates=13700, lr=0.000540343, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=11785 epoch 009: 211 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=579588, ups=1.17, wpb=493562, bsz=16632.6, num_updates=13700, lr=0.000540343, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=11785 epoch 009: 211 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=579588, ups=1.17, wpb=493562, bsz=16632.6, num_updates=13700, lr=0.000540343, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=11785 epoch 009: 211 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=579588, ups=1.17, wpb=493562, bsz=16632.6, num_updates=13700, lr=0.000540343, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=11785 epoch 009: 211 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=579588, ups=1.17, wpb=493562, bsz=16632.6, num_updates=13700, lr=0.000540343, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=11785 epoch 009: 211 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=579588, ups=1.17, wpb=493562, bsz=16632.6, num_updates=13700, lr=0.000540343, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=11785 epoch 009: 211 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=579588, ups=1.17, wpb=493562, bsz=16632.6, num_updates=13700, lr=0.000540343, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=11785 epoch 009: 211 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=579588, ups=1.17, wpb=493562, bsz=16632.6, num_updates=13700, lr=0.000540343, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=11785 epoch 009: 311 / 1689 loss=3.688, nll_loss=2.151, ppl=4.44, wps=585388, ups=1.18, wpb=496999, bsz=16647.7, num_updates=13800, lr=0.000538382, gnorm=0.159, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=11870 epoch 009: 311 / 1689 loss=3.688, nll_loss=2.151, ppl=4.44, wps=585388, ups=1.18, wpb=496999, bsz=16647.7, num_updates=13800, lr=0.000538382, gnorm=0.159, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=11870 epoch 009: 311 / 1689 loss=3.688, nll_loss=2.151, ppl=4.44, wps=585388, ups=1.18, wpb=496999, bsz=16647.7, num_updates=13800, lr=0.000538382, gnorm=0.159, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=11870 epoch 009: 311 / 1689 loss=3.688, nll_loss=2.151, ppl=4.44, wps=585388, ups=1.18, wpb=496999, bsz=16647.7, num_updates=13800, lr=0.000538382, gnorm=0.159, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=11870 epoch 009: 311 / 1689 loss=3.688, nll_loss=2.151, ppl=4.44, wps=585388, ups=1.18, wpb=496999, bsz=16647.7, num_updates=13800, lr=0.000538382, gnorm=0.159, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=11870 epoch 009: 311 / 1689 loss=3.688, nll_loss=2.151, ppl=4.44, wps=585388, ups=1.18, wpb=496999, bsz=16647.7, num_updates=13800, lr=0.000538382, gnorm=0.159, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=11870 epoch 009: 311 / 1689 loss=3.688, nll_loss=2.151, ppl=4.44, wps=585388, ups=1.18, wpb=496999, bsz=16647.7, num_updates=13800, lr=0.000538382, gnorm=0.159, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=11870 epoch 009: 311 / 1689 loss=3.688, nll_loss=2.151, ppl=4.44, wps=585388, ups=1.18, wpb=496999, bsz=16647.7, num_updates=13800, lr=0.000538382, gnorm=0.159, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=11870 epoch 009: 311 / 1689 loss=3.688, nll_loss=2.151, ppl=4.44, wps=585388, ups=1.18, wpb=496999, bsz=16647.7, num_updates=13800, lr=0.000538382, gnorm=0.159, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=11870 epoch 009: 411 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=586794, ups=1.19, wpb=495079, bsz=16186.1, num_updates=13900, lr=0.000536442, gnorm=0.17, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=11955 epoch 009: 411 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=586794, ups=1.19, wpb=495079, bsz=16186.1, num_updates=13900, lr=0.000536442, gnorm=0.17, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=11955 epoch 009: 411 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=586794, ups=1.19, wpb=495079, bsz=16186.1, num_updates=13900, lr=0.000536442, gnorm=0.17, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=11955 epoch 009: 411 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=586794, ups=1.19, wpb=495079, bsz=16186.1, num_updates=13900, lr=0.000536442, gnorm=0.17, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=11955 epoch 009: 411 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=586794, ups=1.19, wpb=495079, bsz=16186.1, num_updates=13900, lr=0.000536442, gnorm=0.17, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=11955 epoch 009: 411 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=586794, ups=1.19, wpb=495079, bsz=16186.1, num_updates=13900, lr=0.000536442, gnorm=0.17, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=11955 epoch 009: 411 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=586794, ups=1.19, wpb=495079, bsz=16186.1, num_updates=13900, lr=0.000536442, gnorm=0.17, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=11955 epoch 009: 411 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=586794, ups=1.19, wpb=495079, bsz=16186.1, num_updates=13900, lr=0.000536442, gnorm=0.17, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=11955 epoch 009: 411 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=586794, ups=1.19, wpb=495079, bsz=16186.1, num_updates=13900, lr=0.000536442, gnorm=0.17, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=11955 epoch 009: 511 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=583859, ups=1.18, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=12039 epoch 009: 511 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=583859, ups=1.18, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=12039 epoch 009: 511 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=583859, ups=1.18, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=12039 epoch 009: 511 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=583859, ups=1.18, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=12039 epoch 009: 511 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=583859, ups=1.18, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=12039 epoch 009: 511 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=583859, ups=1.18, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=12039 epoch 009: 511 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=583859, ups=1.18, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=12039 epoch 009: 511 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=583859, ups=1.18, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=12039 epoch 009: 511 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=583859, ups=1.18, wpb=495012, bsz=16536.7, num_updates=14000, lr=0.000534522, gnorm=0.168, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=12039 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 3.785 | nll_loss 2.236 | ppl 4.71 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.776 epoch 009 | valid on 'valid' subset | loss 3.785 | nll_loss 2.236 | ppl 4.71 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.776 epoch 009 | valid on 'valid' subset | loss 3.785 | nll_loss 2.236 | ppl 4.71 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.776 epoch 009 | valid on 'valid' subset | loss 3.785 | nll_loss 2.236 | ppl 4.71 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.776 epoch 009 | valid on 'valid' subset | loss 3.785 | nll_loss 2.236 | ppl 4.71 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.776 epoch 009 | valid on 'valid' subset | loss 3.785 | nll_loss 2.236 | ppl 4.71 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.776 epoch 009 | valid on 'valid' subset | loss 3.785 | nll_loss 2.236 | ppl 4.71 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.776 epoch 009 | valid on 'valid' subset | loss 3.785 | nll_loss 2.236 | ppl 4.71 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.776 epoch 009 | valid on 'valid' subset | loss 3.785 | nll_loss 2.236 | ppl 4.71 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.776 epoch 009: 611 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=507804, ups=1.03, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.172, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=12137 epoch 009: 611 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=507804, ups=1.03, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.172, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=12137 epoch 009: 611 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=507804, ups=1.03, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.172, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=12137 epoch 009: 611 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=507804, ups=1.03, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.172, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=12137 epoch 009: 611 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=507804, ups=1.03, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.172, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=12137 epoch 009: 611 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=507804, ups=1.03, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.172, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=12137 epoch 009: 611 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=507804, ups=1.03, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.172, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=12137 epoch 009: 611 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=507804, ups=1.03, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.172, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=12137 epoch 009: 611 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=507804, ups=1.03, wpb=495312, bsz=16384.8, num_updates=14100, lr=0.000532624, gnorm=0.172, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=12137 epoch 009: 711 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=587065, ups=1.19, wpb=495283, bsz=16520.8, num_updates=14200, lr=0.000530745, gnorm=0.166, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=12221 epoch 009: 711 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=587065, ups=1.19, wpb=495283, bsz=16520.8, num_updates=14200, lr=0.000530745, gnorm=0.166, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=12221 epoch 009: 711 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=587065, ups=1.19, wpb=495283, bsz=16520.8, num_updates=14200, lr=0.000530745, gnorm=0.166, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=12221 epoch 009: 711 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=587065, ups=1.19, wpb=495283, bsz=16520.8, num_updates=14200, lr=0.000530745, gnorm=0.166, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=12221 epoch 009: 711 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=587065, ups=1.19, wpb=495283, bsz=16520.8, num_updates=14200, lr=0.000530745, gnorm=0.166, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=12221 epoch 009: 711 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=587065, ups=1.19, wpb=495283, bsz=16520.8, num_updates=14200, lr=0.000530745, gnorm=0.166, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=12221 epoch 009: 711 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=587065, ups=1.19, wpb=495283, bsz=16520.8, num_updates=14200, lr=0.000530745, gnorm=0.166, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=12221 epoch 009: 711 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=587065, ups=1.19, wpb=495283, bsz=16520.8, num_updates=14200, lr=0.000530745, gnorm=0.166, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=12221 epoch 009: 711 / 1689 loss=3.695, nll_loss=2.159, ppl=4.47, wps=587065, ups=1.19, wpb=495283, bsz=16520.8, num_updates=14200, lr=0.000530745, gnorm=0.166, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=12221 epoch 009: 811 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=588662, ups=1.19, wpb=494861, bsz=16637.5, num_updates=14300, lr=0.000528886, gnorm=0.166, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=12305 epoch 009: 811 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=588662, ups=1.19, wpb=494861, bsz=16637.5, num_updates=14300, lr=0.000528886, gnorm=0.166, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=12305 epoch 009: 811 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=588662, ups=1.19, wpb=494861, bsz=16637.5, num_updates=14300, lr=0.000528886, gnorm=0.166, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=12305 epoch 009: 811 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=588662, ups=1.19, wpb=494861, bsz=16637.5, num_updates=14300, lr=0.000528886, gnorm=0.166, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=12305 epoch 009: 811 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=588662, ups=1.19, wpb=494861, bsz=16637.5, num_updates=14300, lr=0.000528886, gnorm=0.166, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=12305 epoch 009: 811 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=588662, ups=1.19, wpb=494861, bsz=16637.5, num_updates=14300, lr=0.000528886, gnorm=0.166, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=12305 epoch 009: 811 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=588662, ups=1.19, wpb=494861, bsz=16637.5, num_updates=14300, lr=0.000528886, gnorm=0.166, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=12305 epoch 009: 811 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=588662, ups=1.19, wpb=494861, bsz=16637.5, num_updates=14300, lr=0.000528886, gnorm=0.166, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=12305 epoch 009: 811 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=588662, ups=1.19, wpb=494861, bsz=16637.5, num_updates=14300, lr=0.000528886, gnorm=0.166, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=12305 epoch 009: 912 / 1689 loss=3.705, nll_loss=2.17, ppl=4.5, wps=580475, ups=1.17, wpb=495507, bsz=16636.8, num_updates=14400, lr=0.000527046, gnorm=0.164, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=12391 epoch 009: 912 / 1689 loss=3.705, nll_loss=2.17, ppl=4.5, wps=580475, ups=1.17, wpb=495507, bsz=16636.8, num_updates=14400, lr=0.000527046, gnorm=0.164, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=12391 epoch 009: 912 / 1689 loss=3.705, nll_loss=2.17, ppl=4.5, wps=580475, ups=1.17, wpb=495507, bsz=16636.8, num_updates=14400, lr=0.000527046, gnorm=0.164, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=12391 epoch 009: 912 / 1689 loss=3.705, nll_loss=2.17, ppl=4.5, wps=580475, ups=1.17, wpb=495507, bsz=16636.8, num_updates=14400, lr=0.000527046, gnorm=0.164, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=12391 epoch 009: 912 / 1689 loss=3.705, nll_loss=2.17, ppl=4.5, wps=580475, ups=1.17, wpb=495507, bsz=16636.8, num_updates=14400, lr=0.000527046, gnorm=0.164, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=12391 epoch 009: 912 / 1689 loss=3.705, nll_loss=2.17, ppl=4.5, wps=580475, ups=1.17, wpb=495507, bsz=16636.8, num_updates=14400, lr=0.000527046, gnorm=0.164, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=12391 epoch 009: 912 / 1689 loss=3.705, nll_loss=2.17, ppl=4.5, wps=580475, ups=1.17, wpb=495507, bsz=16636.8, num_updates=14400, lr=0.000527046, gnorm=0.164, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=12391 epoch 009: 912 / 1689 loss=3.705, nll_loss=2.17, ppl=4.5, wps=580475, ups=1.17, wpb=495507, bsz=16636.8, num_updates=14400, lr=0.000527046, gnorm=0.164, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=12391 epoch 009: 912 / 1689 loss=3.705, nll_loss=2.17, ppl=4.5, wps=580475, ups=1.17, wpb=495507, bsz=16636.8, num_updates=14400, lr=0.000527046, gnorm=0.164, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=12391 epoch 009: 1012 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=587605, ups=1.19, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12475 epoch 009: 1012 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=587605, ups=1.19, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12475 epoch 009: 1012 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=587605, ups=1.19, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12475 epoch 009: 1012 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=587605, ups=1.19, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12475 epoch 009: 1012 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=587605, ups=1.19, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12475 epoch 009: 1012 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=587605, ups=1.19, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12475 epoch 009: 1012 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=587605, ups=1.19, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12475 epoch 009: 1012 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=587605, ups=1.19, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12475 epoch 009: 1012 / 1689 loss=3.702, nll_loss=2.167, ppl=4.49, wps=587605, ups=1.19, wpb=495128, bsz=16412.1, num_updates=14500, lr=0.000525226, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12475 epoch 009: 1112 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=585296, ups=1.18, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=12560 epoch 009: 1112 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=585296, ups=1.18, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=12560 epoch 009: 1112 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=585296, ups=1.18, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=12560 epoch 009: 1112 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=585296, ups=1.18, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=12560 epoch 009: 1112 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=585296, ups=1.18, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=12560 epoch 009: 1112 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=585296, ups=1.18, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=12560 epoch 009: 1112 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=585296, ups=1.18, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=12560 epoch 009: 1112 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=585296, ups=1.18, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=12560 epoch 009: 1112 / 1689 loss=3.706, nll_loss=2.172, ppl=4.51, wps=585296, ups=1.18, wpb=494941, bsz=16731, num_updates=14600, lr=0.000523424, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=12560 epoch 009: 1212 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=586142, ups=1.18, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=12644 epoch 009: 1212 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=586142, ups=1.18, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=12644 epoch 009: 1212 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=586142, ups=1.18, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=12644 epoch 009: 1212 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=586142, ups=1.18, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=12644 epoch 009: 1212 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=586142, ups=1.18, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=12644 epoch 009: 1212 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=586142, ups=1.18, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=12644 epoch 009: 1212 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=586142, ups=1.18, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=12644 epoch 009: 1212 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=586142, ups=1.18, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=12644 epoch 009: 1212 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=586142, ups=1.18, wpb=495194, bsz=16213, num_updates=14700, lr=0.000521641, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=12644 epoch 009: 1312 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=586998, ups=1.18, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.165, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12728 epoch 009: 1312 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=586998, ups=1.18, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.165, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12728 epoch 009: 1312 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=586998, ups=1.18, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.165, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12728 epoch 009: 1312 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=586998, ups=1.18, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.165, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12728 epoch 009: 1312 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=586998, ups=1.18, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.165, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12728 epoch 009: 1312 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=586998, ups=1.18, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.165, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12728 epoch 009: 1312 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=586998, ups=1.18, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.165, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12728 epoch 009: 1312 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=586998, ups=1.18, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.165, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12728 epoch 009: 1312 / 1689 loss=3.7, nll_loss=2.166, ppl=4.49, wps=586998, ups=1.18, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.165, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=12728 epoch 009: 1413 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=581497, ups=1.17, wpb=495111, bsz=16668.8, num_updates=14900, lr=0.000518128, gnorm=0.164, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=12814 epoch 009: 1413 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=581497, ups=1.17, wpb=495111, bsz=16668.8, num_updates=14900, lr=0.000518128, gnorm=0.164, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=12814 epoch 009: 1413 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=581497, ups=1.17, wpb=495111, bsz=16668.8, num_updates=14900, lr=0.000518128, gnorm=0.164, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=12814 epoch 009: 1413 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=581497, ups=1.17, wpb=495111, bsz=16668.8, num_updates=14900, lr=0.000518128, gnorm=0.164, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=12814 epoch 009: 1413 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=581497, ups=1.17, wpb=495111, bsz=16668.8, num_updates=14900, lr=0.000518128, gnorm=0.164, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=12814 epoch 009: 1413 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=581497, ups=1.17, wpb=495111, bsz=16668.8, num_updates=14900, lr=0.000518128, gnorm=0.164, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=12814 epoch 009: 1413 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=581497, ups=1.17, wpb=495111, bsz=16668.8, num_updates=14900, lr=0.000518128, gnorm=0.164, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=12814 epoch 009: 1413 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=581497, ups=1.17, wpb=495111, bsz=16668.8, num_updates=14900, lr=0.000518128, gnorm=0.164, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=12814 epoch 009: 1413 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=581497, ups=1.17, wpb=495111, bsz=16668.8, num_updates=14900, lr=0.000518128, gnorm=0.164, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=12814 epoch 009: 1513 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=585122, ups=1.18, wpb=495755, bsz=16440.4, num_updates=15000, lr=0.000516398, gnorm=0.169, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=12898 epoch 009: 1513 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=585122, ups=1.18, wpb=495755, bsz=16440.4, num_updates=15000, lr=0.000516398, gnorm=0.169, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=12898 epoch 009: 1513 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=585122, ups=1.18, wpb=495755, bsz=16440.4, num_updates=15000, lr=0.000516398, gnorm=0.169, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=12898 epoch 009: 1513 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=585122, ups=1.18, wpb=495755, bsz=16440.4, num_updates=15000, lr=0.000516398, gnorm=0.169, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=12898 epoch 009: 1513 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=585122, ups=1.18, wpb=495755, bsz=16440.4, num_updates=15000, lr=0.000516398, gnorm=0.169, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=12898 epoch 009: 1513 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=585122, ups=1.18, wpb=495755, bsz=16440.4, num_updates=15000, lr=0.000516398, gnorm=0.169, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=12898 epoch 009: 1513 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=585122, ups=1.18, wpb=495755, bsz=16440.4, num_updates=15000, lr=0.000516398, gnorm=0.169, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=12898 epoch 009: 1513 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=585122, ups=1.18, wpb=495755, bsz=16440.4, num_updates=15000, lr=0.000516398, gnorm=0.169, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=12898 epoch 009: 1513 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=585122, ups=1.18, wpb=495755, bsz=16440.4, num_updates=15000, lr=0.000516398, gnorm=0.169, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=12898 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 3.765 | nll_loss 2.216 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.765 epoch 009 | valid on 'valid' subset | loss 3.765 | nll_loss 2.216 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.765 epoch 009 | valid on 'valid' subset | loss 3.765 | nll_loss 2.216 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.765 epoch 009 | valid on 'valid' subset | loss 3.765 | nll_loss 2.216 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.765 epoch 009 | valid on 'valid' subset | loss 3.765 | nll_loss 2.216 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.765 epoch 009 | valid on 'valid' subset | loss 3.765 | nll_loss 2.216 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.765 epoch 009 | valid on 'valid' subset | loss 3.765 | nll_loss 2.216 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.765 epoch 009 | valid on 'valid' subset | loss 3.765 | nll_loss 2.216 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.765 epoch 009 | valid on 'valid' subset | loss 3.765 | nll_loss 2.216 | ppl 4.65 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.765 epoch 009: 1613 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=506814, ups=1.02, wpb=495551, bsz=16688.3, num_updates=15100, lr=0.000514685, gnorm=0.169, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=12996 epoch 009: 1613 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=506814, ups=1.02, wpb=495551, bsz=16688.3, num_updates=15100, lr=0.000514685, gnorm=0.169, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=12996 epoch 009: 1613 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=506814, ups=1.02, wpb=495551, bsz=16688.3, num_updates=15100, lr=0.000514685, gnorm=0.169, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=12996 epoch 009: 1613 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=506814, ups=1.02, wpb=495551, bsz=16688.3, num_updates=15100, lr=0.000514685, gnorm=0.169, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=12996 epoch 009: 1613 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=506814, ups=1.02, wpb=495551, bsz=16688.3, num_updates=15100, lr=0.000514685, gnorm=0.169, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=12996 epoch 009: 1613 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=506814, ups=1.02, wpb=495551, bsz=16688.3, num_updates=15100, lr=0.000514685, gnorm=0.169, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=12996 epoch 009: 1613 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=506814, ups=1.02, wpb=495551, bsz=16688.3, num_updates=15100, lr=0.000514685, gnorm=0.169, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=12996 epoch 009: 1613 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=506814, ups=1.02, wpb=495551, bsz=16688.3, num_updates=15100, lr=0.000514685, gnorm=0.169, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=12996 epoch 009: 1613 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=506814, ups=1.02, wpb=495551, bsz=16688.3, num_updates=15100, lr=0.000514685, gnorm=0.169, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=12996 end of epoch 9 (average epoch stats below) epoch 009 | loss 3.699 | nll_loss 2.164 | ppl 4.48 | wps 574672 | ups 1.16 | wpb 495113 | bsz 16503.6 | num_updates 15176 | lr 0.000513395 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 65.1 | wall 13060 epoch 009 | loss 3.699 | nll_loss 2.164 | ppl 4.48 | wps 574672 | ups 1.16 | wpb 495113 | bsz 16503.6 | num_updates 15176 | lr 0.000513395 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 65.1 | wall 13060 epoch 009 | loss 3.699 | nll_loss 2.164 | ppl 4.48 | wps 574672 | ups 1.16 | wpb 495113 | bsz 16503.6 | num_updates 15176 | lr 0.000513395 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 65.1 | wall 13060 epoch 009 | loss 3.699 | nll_loss 2.164 | ppl 4.48 | wps 574672 | ups 1.16 | wpb 495113 | bsz 16503.6 | num_updates 15176 | lr 0.000513395 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 65.1 | wall 13060 epoch 009 | loss 3.699 | nll_loss 2.164 | ppl 4.48 | wps 574672 | ups 1.16 | wpb 495113 | bsz 16503.6 | num_updates 15176 | lr 0.000513395 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 65.1 | wall 13060 epoch 009 | loss 3.699 | nll_loss 2.164 | ppl 4.48 | wps 574672 | ups 1.16 | wpb 495113 | bsz 16503.6 | num_updates 15176 | lr 0.000513395 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 65.1 | wall 13060 epoch 009 | loss 3.699 | nll_loss 2.164 | ppl 4.48 | wps 574672 | ups 1.16 | wpb 495113 | bsz 16503.6 | num_updates 15176 | lr 0.000513395 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 65.1 | wall 13060 epoch 009 | loss 3.699 | nll_loss 2.164 | ppl 4.48 | wps 574672 | ups 1.16 | wpb 495113 | bsz 16503.6 | num_updates 15176 | lr 0.000513395 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 65.1 | wall 13060 epoch 009 | loss 3.699 | nll_loss 2.164 | ppl 4.48 | wps 574672 | ups 1.16 | wpb 495113 | bsz 16503.6 | num_updates 15176 | lr 0.000513395 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 65.1 | wall 13060 Start iterating over samples epoch 010: 24 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=583859, ups=1.19, wpb=491640, bsz=16581.4, num_updates=15200, lr=0.000512989, gnorm=0.158, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=13080 epoch 010: 24 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=583859, ups=1.19, wpb=491640, bsz=16581.4, num_updates=15200, lr=0.000512989, gnorm=0.158, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=13080 epoch 010: 24 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=583859, ups=1.19, wpb=491640, bsz=16581.4, num_updates=15200, lr=0.000512989, gnorm=0.158, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=13080 epoch 010: 24 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=583859, ups=1.19, wpb=491640, bsz=16581.4, num_updates=15200, lr=0.000512989, gnorm=0.158, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=13080 epoch 010: 24 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=583859, ups=1.19, wpb=491640, bsz=16581.4, num_updates=15200, lr=0.000512989, gnorm=0.158, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=13080 epoch 010: 24 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=583859, ups=1.19, wpb=491640, bsz=16581.4, num_updates=15200, lr=0.000512989, gnorm=0.158, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=13080 epoch 010: 24 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=583859, ups=1.19, wpb=491640, bsz=16581.4, num_updates=15200, lr=0.000512989, gnorm=0.158, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=13080 epoch 010: 24 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=583859, ups=1.19, wpb=491640, bsz=16581.4, num_updates=15200, lr=0.000512989, gnorm=0.158, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=13080 epoch 010: 24 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=583859, ups=1.19, wpb=491640, bsz=16581.4, num_updates=15200, lr=0.000512989, gnorm=0.158, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=13080 epoch 010: 24 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=583859, ups=1.19, wpb=491640, bsz=16581.4, num_updates=15200, lr=0.000512989, gnorm=0.158, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=13080 epoch 010: 124 / 1689 loss=3.657, nll_loss=2.116, ppl=4.34, wps=585049, ups=1.18, wpb=495572, bsz=16445.6, num_updates=15300, lr=0.00051131, gnorm=0.166, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=13165 epoch 010: 124 / 1689 loss=3.657, nll_loss=2.116, ppl=4.34, wps=585049, ups=1.18, wpb=495572, bsz=16445.6, num_updates=15300, lr=0.00051131, gnorm=0.166, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=13165 epoch 010: 124 / 1689 loss=3.657, nll_loss=2.116, ppl=4.34, wps=585049, ups=1.18, wpb=495572, bsz=16445.6, num_updates=15300, lr=0.00051131, gnorm=0.166, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=13165 epoch 010: 124 / 1689 loss=3.657, nll_loss=2.116, ppl=4.34, wps=585049, ups=1.18, wpb=495572, bsz=16445.6, num_updates=15300, lr=0.00051131, gnorm=0.166, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=13165 epoch 010: 124 / 1689 loss=3.657, nll_loss=2.116, ppl=4.34, wps=585049, ups=1.18, wpb=495572, bsz=16445.6, num_updates=15300, lr=0.00051131, gnorm=0.166, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=13165 epoch 010: 124 / 1689 loss=3.657, nll_loss=2.116, ppl=4.34, wps=585049, ups=1.18, wpb=495572, bsz=16445.6, num_updates=15300, lr=0.00051131, gnorm=0.166, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=13165 epoch 010: 124 / 1689 loss=3.657, nll_loss=2.116, ppl=4.34, wps=585049, ups=1.18, wpb=495572, bsz=16445.6, num_updates=15300, lr=0.00051131, gnorm=0.166, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=13165 epoch 010: 124 / 1689 loss=3.657, nll_loss=2.116, ppl=4.34, wps=585049, ups=1.18, wpb=495572, bsz=16445.6, num_updates=15300, lr=0.00051131, gnorm=0.166, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=13165 epoch 010: 124 / 1689 loss=3.657, nll_loss=2.116, ppl=4.34, wps=585049, ups=1.18, wpb=495572, bsz=16445.6, num_updates=15300, lr=0.00051131, gnorm=0.166, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=13165 epoch 010: 124 / 1689 loss=3.657, nll_loss=2.116, ppl=4.34, wps=585049, ups=1.18, wpb=495572, bsz=16445.6, num_updates=15300, lr=0.00051131, gnorm=0.166, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=13165 epoch 010: 224 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=586727, ups=1.18, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=13249 epoch 010: 224 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=586727, ups=1.18, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=13249 epoch 010: 224 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=586727, ups=1.18, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=13249 epoch 010: 224 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=586727, ups=1.18, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=13249 epoch 010: 224 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=586727, ups=1.18, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=13249 epoch 010: 224 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=586727, ups=1.18, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=13249 epoch 010: 224 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=586727, ups=1.18, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=13249 epoch 010: 224 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=586727, ups=1.18, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=13249 epoch 010: 224 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=586727, ups=1.18, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=13249 epoch 010: 224 / 1689 loss=3.673, nll_loss=2.135, ppl=4.39, wps=586727, ups=1.18, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.163, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=13249 epoch 010: 324 / 1689 loss=3.679, nll_loss=2.141, ppl=4.41, wps=586311, ups=1.18, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=13334 epoch 010: 324 / 1689 loss=3.679, nll_loss=2.141, ppl=4.41, wps=586311, ups=1.18, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=13334 epoch 010: 324 / 1689 loss=3.679, nll_loss=2.141, ppl=4.41, wps=586311, ups=1.18, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=13334 epoch 010: 324 / 1689 loss=3.679, nll_loss=2.141, ppl=4.41, wps=586311, ups=1.18, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=13334 epoch 010: 324 / 1689 loss=3.679, nll_loss=2.141, ppl=4.41, wps=586311, ups=1.18, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=13334 epoch 010: 324 / 1689 loss=3.679, nll_loss=2.141, ppl=4.41, wps=586311, ups=1.18, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=13334 epoch 010: 324 / 1689 loss=3.679, nll_loss=2.141, ppl=4.41, wps=586311, ups=1.18, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=13334 epoch 010: 324 / 1689 loss=3.679, nll_loss=2.141, ppl=4.41, wps=586311, ups=1.18, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=13334 epoch 010: 324 / 1689 loss=3.679, nll_loss=2.141, ppl=4.41, wps=586311, ups=1.18, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=13334 epoch 010: 324 / 1689 loss=3.679, nll_loss=2.141, ppl=4.41, wps=586311, ups=1.18, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=13334 epoch 010: 424 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=587436, ups=1.18, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=13418 epoch 010: 424 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=587436, ups=1.18, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=13418 epoch 010: 424 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=587436, ups=1.18, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=13418 epoch 010: 424 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=587436, ups=1.18, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=13418 epoch 010: 424 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=587436, ups=1.18, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=13418 epoch 010: 424 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=587436, ups=1.18, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=13418 epoch 010: 424 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=587436, ups=1.18, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=13418 epoch 010: 424 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=587436, ups=1.18, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=13418 epoch 010: 424 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=587436, ups=1.18, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=13418 epoch 010: 424 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=587436, ups=1.18, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=13418 epoch 010: 524 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=589362, ups=1.19, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.164, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=13503 epoch 010: 524 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=589362, ups=1.19, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.164, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=13503 epoch 010: 524 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=589362, ups=1.19, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.164, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=13503 epoch 010: 524 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=589362, ups=1.19, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.164, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=13503 epoch 010: 524 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=589362, ups=1.19, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.164, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=13503 epoch 010: 524 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=589362, ups=1.19, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.164, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=13503 epoch 010: 524 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=589362, ups=1.19, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.164, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=13503 epoch 010: 524 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=589362, ups=1.19, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.164, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=13503 epoch 010: 524 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=589362, ups=1.19, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.164, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=13503 epoch 010: 524 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=589362, ups=1.19, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.164, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=13503 epoch 010: 624 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=584179, ups=1.18, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.167, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=13587 epoch 010: 624 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=584179, ups=1.18, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.167, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=13587 epoch 010: 624 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=584179, ups=1.18, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.167, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=13587 epoch 010: 624 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=584179, ups=1.18, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.167, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=13587 epoch 010: 624 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=584179, ups=1.18, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.167, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=13587 epoch 010: 624 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=584179, ups=1.18, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.167, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=13587 epoch 010: 624 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=584179, ups=1.18, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.167, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=13587 epoch 010: 624 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=584179, ups=1.18, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.167, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=13587 epoch 010: 624 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=584179, ups=1.18, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.167, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=13587 epoch 010: 624 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=584179, ups=1.18, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.167, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=13587 epoch 010: 724 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=583470, ups=1.18, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.164, clip=0, loss_scale=8, train_wall=84, gb_free=60.2, wall=13672 epoch 010: 724 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=583470, ups=1.18, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.164, clip=0, loss_scale=8, train_wall=84, gb_free=60.2, wall=13672 epoch 010: 724 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=583470, ups=1.18, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.164, clip=0, loss_scale=8, train_wall=84, gb_free=60.2, wall=13672 epoch 010: 724 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=583470, ups=1.18, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.164, clip=0, loss_scale=8, train_wall=84, gb_free=60.2, wall=13672 epoch 010: 724 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=583470, ups=1.18, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.164, clip=0, loss_scale=8, train_wall=84, gb_free=60.2, wall=13672 epoch 010: 724 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=583470, ups=1.18, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.164, clip=0, loss_scale=8, train_wall=84, gb_free=60.2, wall=13672 epoch 010: 724 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=583470, ups=1.18, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.164, clip=0, loss_scale=8, train_wall=84, gb_free=60.2, wall=13672 epoch 010: 724 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=583470, ups=1.18, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.164, clip=0, loss_scale=8, train_wall=84, gb_free=60.2, wall=13672 epoch 010: 724 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=583470, ups=1.18, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.164, clip=0, loss_scale=8, train_wall=84, gb_free=60.2, wall=13672 epoch 010: 724 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=583470, ups=1.18, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.164, clip=0, loss_scale=8, train_wall=84, gb_free=60.2, wall=13672 epoch 010: 825 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=581451, ups=1.17, wpb=496051, bsz=16551.9, num_updates=16000, lr=0.0005, gnorm=0.16, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=13758 epoch 010: 825 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=581451, ups=1.17, wpb=496051, bsz=16551.9, num_updates=16000, lr=0.0005, gnorm=0.16, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=13758 epoch 010: 825 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=581451, ups=1.17, wpb=496051, bsz=16551.9, num_updates=16000, lr=0.0005, gnorm=0.16, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=13758 epoch 010: 825 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=581451, ups=1.17, wpb=496051, bsz=16551.9, num_updates=16000, lr=0.0005, gnorm=0.16, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=13758 epoch 010: 825 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=581451, ups=1.17, wpb=496051, bsz=16551.9, num_updates=16000, lr=0.0005, gnorm=0.16, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=13758 epoch 010: 825 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=581451, ups=1.17, wpb=496051, bsz=16551.9, num_updates=16000, lr=0.0005, gnorm=0.16, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=13758 epoch 010: 825 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=581451, ups=1.17, wpb=496051, bsz=16551.9, num_updates=16000, lr=0.0005, gnorm=0.16, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=13758 epoch 010: 825 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=581451, ups=1.17, wpb=496051, bsz=16551.9, num_updates=16000, lr=0.0005, gnorm=0.16, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=13758 epoch 010: 825 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=581451, ups=1.17, wpb=496051, bsz=16551.9, num_updates=16000, lr=0.0005, gnorm=0.16, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=13758 epoch 010: 825 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=581451, ups=1.17, wpb=496051, bsz=16551.9, num_updates=16000, lr=0.0005, gnorm=0.16, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=13758 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 3.767 | nll_loss 2.22 | ppl 4.66 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.765 epoch 010 | valid on 'valid' subset | loss 3.767 | nll_loss 2.22 | ppl 4.66 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.765 epoch 010 | valid on 'valid' subset | loss 3.767 | nll_loss 2.22 | ppl 4.66 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.765 epoch 010 | valid on 'valid' subset | loss 3.767 | nll_loss 2.22 | ppl 4.66 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.765 epoch 010 | valid on 'valid' subset | loss 3.767 | nll_loss 2.22 | ppl 4.66 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.765 epoch 010 | valid on 'valid' subset | loss 3.767 | nll_loss 2.22 | ppl 4.66 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.765 epoch 010 | valid on 'valid' subset | loss 3.767 | nll_loss 2.22 | ppl 4.66 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.765 epoch 010 | valid on 'valid' subset | loss 3.767 | nll_loss 2.22 | ppl 4.66 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.765 epoch 010 | valid on 'valid' subset | loss 3.767 | nll_loss 2.22 | ppl 4.66 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.765 epoch 010 | valid on 'valid' subset | loss 3.767 | nll_loss 2.22 | ppl 4.66 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.765 epoch 010: 925 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=534395, ups=1.08, wpb=496577, bsz=16745.5, num_updates=16100, lr=0.000498445, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=13851 epoch 010: 925 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=534395, ups=1.08, wpb=496577, bsz=16745.5, num_updates=16100, lr=0.000498445, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=13851 epoch 010: 925 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=534395, ups=1.08, wpb=496577, bsz=16745.5, num_updates=16100, lr=0.000498445, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=13851 epoch 010: 925 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=534395, ups=1.08, wpb=496577, bsz=16745.5, num_updates=16100, lr=0.000498445, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=13851 epoch 010: 925 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=534395, ups=1.08, wpb=496577, bsz=16745.5, num_updates=16100, lr=0.000498445, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=13851 epoch 010: 925 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=534395, ups=1.08, wpb=496577, bsz=16745.5, num_updates=16100, lr=0.000498445, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=13851 epoch 010: 925 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=534395, ups=1.08, wpb=496577, bsz=16745.5, num_updates=16100, lr=0.000498445, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=13851 epoch 010: 925 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=534395, ups=1.08, wpb=496577, bsz=16745.5, num_updates=16100, lr=0.000498445, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=13851 epoch 010: 925 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=534395, ups=1.08, wpb=496577, bsz=16745.5, num_updates=16100, lr=0.000498445, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=13851 epoch 010: 925 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=534395, ups=1.08, wpb=496577, bsz=16745.5, num_updates=16100, lr=0.000498445, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=13851 epoch 010: 1025 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=585396, ups=1.18, wpb=495321, bsz=16602.6, num_updates=16200, lr=0.000496904, gnorm=0.162, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=13935 epoch 010: 1025 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=585396, ups=1.18, wpb=495321, bsz=16602.6, num_updates=16200, lr=0.000496904, gnorm=0.162, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=13935 epoch 010: 1025 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=585396, ups=1.18, wpb=495321, bsz=16602.6, num_updates=16200, lr=0.000496904, gnorm=0.162, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=13935 epoch 010: 1025 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=585396, ups=1.18, wpb=495321, bsz=16602.6, num_updates=16200, lr=0.000496904, gnorm=0.162, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=13935 epoch 010: 1025 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=585396, ups=1.18, wpb=495321, bsz=16602.6, num_updates=16200, lr=0.000496904, gnorm=0.162, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=13935 epoch 010: 1025 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=585396, ups=1.18, wpb=495321, bsz=16602.6, num_updates=16200, lr=0.000496904, gnorm=0.162, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=13935 epoch 010: 1025 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=585396, ups=1.18, wpb=495321, bsz=16602.6, num_updates=16200, lr=0.000496904, gnorm=0.162, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=13935 epoch 010: 1025 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=585396, ups=1.18, wpb=495321, bsz=16602.6, num_updates=16200, lr=0.000496904, gnorm=0.162, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=13935 epoch 010: 1025 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=585396, ups=1.18, wpb=495321, bsz=16602.6, num_updates=16200, lr=0.000496904, gnorm=0.162, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=13935 epoch 010: 1025 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=585396, ups=1.18, wpb=495321, bsz=16602.6, num_updates=16200, lr=0.000496904, gnorm=0.162, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=13935 epoch 010: 1125 / 1689 loss=3.685, nll_loss=2.149, ppl=4.43, wps=589489, ups=1.19, wpb=494677, bsz=16349.9, num_updates=16300, lr=0.000495377, gnorm=0.157, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=14019 epoch 010: 1125 / 1689 loss=3.685, nll_loss=2.149, ppl=4.43, wps=589489, ups=1.19, wpb=494677, bsz=16349.9, num_updates=16300, lr=0.000495377, gnorm=0.157, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=14019 epoch 010: 1125 / 1689 loss=3.685, nll_loss=2.149, ppl=4.43, wps=589489, ups=1.19, wpb=494677, bsz=16349.9, num_updates=16300, lr=0.000495377, gnorm=0.157, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=14019 epoch 010: 1125 / 1689 loss=3.685, nll_loss=2.149, ppl=4.43, wps=589489, ups=1.19, wpb=494677, bsz=16349.9, num_updates=16300, lr=0.000495377, gnorm=0.157, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=14019 epoch 010: 1125 / 1689 loss=3.685, nll_loss=2.149, ppl=4.43, wps=589489, ups=1.19, wpb=494677, bsz=16349.9, num_updates=16300, lr=0.000495377, gnorm=0.157, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=14019 epoch 010: 1125 / 1689 loss=3.685, nll_loss=2.149, ppl=4.43, wps=589489, ups=1.19, wpb=494677, bsz=16349.9, num_updates=16300, lr=0.000495377, gnorm=0.157, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=14019 epoch 010: 1125 / 1689 loss=3.685, nll_loss=2.149, ppl=4.43, wps=589489, ups=1.19, wpb=494677, bsz=16349.9, num_updates=16300, lr=0.000495377, gnorm=0.157, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=14019 epoch 010: 1125 / 1689 loss=3.685, nll_loss=2.149, ppl=4.43, wps=589489, ups=1.19, wpb=494677, bsz=16349.9, num_updates=16300, lr=0.000495377, gnorm=0.157, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=14019 epoch 010: 1125 / 1689 loss=3.685, nll_loss=2.149, ppl=4.43, wps=589489, ups=1.19, wpb=494677, bsz=16349.9, num_updates=16300, lr=0.000495377, gnorm=0.157, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=14019 epoch 010: 1125 / 1689 loss=3.685, nll_loss=2.149, ppl=4.43, wps=589489, ups=1.19, wpb=494677, bsz=16349.9, num_updates=16300, lr=0.000495377, gnorm=0.157, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=14019 epoch 010: 1225 / 1689 loss=3.683, nll_loss=2.146, ppl=4.43, wps=584785, ups=1.18, wpb=495627, bsz=16405.8, num_updates=16400, lr=0.000493865, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=14104 epoch 010: 1225 / 1689 loss=3.683, nll_loss=2.146, ppl=4.43, wps=584785, ups=1.18, wpb=495627, bsz=16405.8, num_updates=16400, lr=0.000493865, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=14104 epoch 010: 1225 / 1689 loss=3.683, nll_loss=2.146, ppl=4.43, wps=584785, ups=1.18, wpb=495627, bsz=16405.8, num_updates=16400, lr=0.000493865, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=14104 epoch 010: 1225 / 1689 loss=3.683, nll_loss=2.146, ppl=4.43, wps=584785, ups=1.18, wpb=495627, bsz=16405.8, num_updates=16400, lr=0.000493865, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=14104 epoch 010: 1225 / 1689 loss=3.683, nll_loss=2.146, ppl=4.43, wps=584785, ups=1.18, wpb=495627, bsz=16405.8, num_updates=16400, lr=0.000493865, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=14104 epoch 010: 1225 / 1689 loss=3.683, nll_loss=2.146, ppl=4.43, wps=584785, ups=1.18, wpb=495627, bsz=16405.8, num_updates=16400, lr=0.000493865, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=14104 epoch 010: 1225 / 1689 loss=3.683, nll_loss=2.146, ppl=4.43, wps=584785, ups=1.18, wpb=495627, bsz=16405.8, num_updates=16400, lr=0.000493865, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=14104 epoch 010: 1225 / 1689 loss=3.683, nll_loss=2.146, ppl=4.43, wps=584785, ups=1.18, wpb=495627, bsz=16405.8, num_updates=16400, lr=0.000493865, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=14104 epoch 010: 1225 / 1689 loss=3.683, nll_loss=2.146, ppl=4.43, wps=584785, ups=1.18, wpb=495627, bsz=16405.8, num_updates=16400, lr=0.000493865, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=14104 epoch 010: 1225 / 1689 loss=3.683, nll_loss=2.146, ppl=4.43, wps=584785, ups=1.18, wpb=495627, bsz=16405.8, num_updates=16400, lr=0.000493865, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=14104 epoch 010: 1326 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=578799, ups=1.17, wpb=494489, bsz=16589.5, num_updates=16500, lr=0.000492366, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=58.7, wall=14189 epoch 010: 1326 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=578799, ups=1.17, wpb=494489, bsz=16589.5, num_updates=16500, lr=0.000492366, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=58.7, wall=14189 epoch 010: 1326 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=578799, ups=1.17, wpb=494489, bsz=16589.5, num_updates=16500, lr=0.000492366, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=58.7, wall=14189 epoch 010: 1326 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=578799, ups=1.17, wpb=494489, bsz=16589.5, num_updates=16500, lr=0.000492366, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=58.7, wall=14189 epoch 010: 1326 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=578799, ups=1.17, wpb=494489, bsz=16589.5, num_updates=16500, lr=0.000492366, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=58.7, wall=14189 epoch 010: 1326 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=578799, ups=1.17, wpb=494489, bsz=16589.5, num_updates=16500, lr=0.000492366, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=58.7, wall=14189 epoch 010: 1326 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=578799, ups=1.17, wpb=494489, bsz=16589.5, num_updates=16500, lr=0.000492366, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=58.7, wall=14189 epoch 010: 1326 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=578799, ups=1.17, wpb=494489, bsz=16589.5, num_updates=16500, lr=0.000492366, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=58.7, wall=14189 epoch 010: 1326 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=578799, ups=1.17, wpb=494489, bsz=16589.5, num_updates=16500, lr=0.000492366, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=58.7, wall=14189 epoch 010: 1326 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=578799, ups=1.17, wpb=494489, bsz=16589.5, num_updates=16500, lr=0.000492366, gnorm=0.161, clip=0, loss_scale=4, train_wall=84, gb_free=58.7, wall=14189 epoch 010: 1426 / 1689 loss=3.681, nll_loss=2.144, ppl=4.42, wps=585222, ups=1.18, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=14274 epoch 010: 1426 / 1689 loss=3.681, nll_loss=2.144, ppl=4.42, wps=585222, ups=1.18, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=14274 epoch 010: 1426 / 1689 loss=3.681, nll_loss=2.144, ppl=4.42, wps=585222, ups=1.18, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=14274 epoch 010: 1426 / 1689 loss=3.681, nll_loss=2.144, ppl=4.42, wps=585222, ups=1.18, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=14274 epoch 010: 1426 / 1689 loss=3.681, nll_loss=2.144, ppl=4.42, wps=585222, ups=1.18, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=14274 epoch 010: 1426 / 1689 loss=3.681, nll_loss=2.144, ppl=4.42, wps=585222, ups=1.18, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=14274 epoch 010: 1426 / 1689 loss=3.681, nll_loss=2.144, ppl=4.42, wps=585222, ups=1.18, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=14274 epoch 010: 1426 / 1689 loss=3.681, nll_loss=2.144, ppl=4.42, wps=585222, ups=1.18, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=14274 epoch 010: 1426 / 1689 loss=3.681, nll_loss=2.144, ppl=4.42, wps=585222, ups=1.18, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=14274 epoch 010: 1426 / 1689 loss=3.681, nll_loss=2.144, ppl=4.42, wps=585222, ups=1.18, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.163, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=14274 epoch 010: 1526 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=581900, ups=1.18, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=14359 epoch 010: 1526 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=581900, ups=1.18, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=14359 epoch 010: 1526 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=581900, ups=1.18, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=14359 epoch 010: 1526 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=581900, ups=1.18, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=14359 epoch 010: 1526 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=581900, ups=1.18, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=14359 epoch 010: 1526 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=581900, ups=1.18, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=14359 epoch 010: 1526 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=581900, ups=1.18, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=14359 epoch 010: 1526 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=581900, ups=1.18, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=14359 epoch 010: 1526 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=581900, ups=1.18, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=14359 epoch 010: 1526 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=581900, ups=1.18, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=14359 epoch 010: 1627 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=580140, ups=1.17, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.155, clip=0, loss_scale=2, train_wall=84, gb_free=57.6, wall=14444 epoch 010: 1627 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=580140, ups=1.17, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.155, clip=0, loss_scale=2, train_wall=84, gb_free=57.6, wall=14444 epoch 010: 1627 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=580140, ups=1.17, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.155, clip=0, loss_scale=2, train_wall=84, gb_free=57.6, wall=14444 epoch 010: 1627 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=580140, ups=1.17, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.155, clip=0, loss_scale=2, train_wall=84, gb_free=57.6, wall=14444 epoch 010: 1627 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=580140, ups=1.17, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.155, clip=0, loss_scale=2, train_wall=84, gb_free=57.6, wall=14444 epoch 010: 1627 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=580140, ups=1.17, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.155, clip=0, loss_scale=2, train_wall=84, gb_free=57.6, wall=14444 epoch 010: 1627 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=580140, ups=1.17, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.155, clip=0, loss_scale=2, train_wall=84, gb_free=57.6, wall=14444 epoch 010: 1627 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=580140, ups=1.17, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.155, clip=0, loss_scale=2, train_wall=84, gb_free=57.6, wall=14444 epoch 010: 1627 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=580140, ups=1.17, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.155, clip=0, loss_scale=2, train_wall=84, gb_free=57.6, wall=14444 epoch 010: 1627 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=580140, ups=1.17, wpb=495744, bsz=16257.6, num_updates=16800, lr=0.00048795, gnorm=0.155, clip=0, loss_scale=2, train_wall=84, gb_free=57.6, wall=14444 end of epoch 10 (average epoch stats below) epoch 010 | loss 3.678 | nll_loss 2.14 | ppl 4.41 | wps 581101 | ups 1.17 | wpb 495133 | bsz 16502.9 | num_updates 16862 | lr 0.000487052 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1409 | gb_free 63.2 | wall 14496 epoch 010 | loss 3.678 | nll_loss 2.14 | ppl 4.41 | wps 581101 | ups 1.17 | wpb 495133 | bsz 16502.9 | num_updates 16862 | lr 0.000487052 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1409 | gb_free 63.2 | wall 14496 epoch 010 | loss 3.678 | nll_loss 2.14 | ppl 4.41 | wps 581101 | ups 1.17 | wpb 495133 | bsz 16502.9 | num_updates 16862 | lr 0.000487052 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1409 | gb_free 63.2 | wall 14496 epoch 010 | loss 3.678 | nll_loss 2.14 | ppl 4.41 | wps 581101 | ups 1.17 | wpb 495133 | bsz 16502.9 | num_updates 16862 | lr 0.000487052 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1409 | gb_free 63.2 | wall 14496 epoch 010 | loss 3.678 | nll_loss 2.14 | ppl 4.41 | wps 581101 | ups 1.17 | wpb 495133 | bsz 16502.9 | num_updates 16862 | lr 0.000487052 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1409 | gb_free 63.2 | wall 14496 epoch 010 | loss 3.678 | nll_loss 2.14 | ppl 4.41 | wps 581101 | ups 1.17 | wpb 495133 | bsz 16502.9 | num_updates 16862 | lr 0.000487052 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1409 | gb_free 63.2 | wall 14496 epoch 010 | loss 3.678 | nll_loss 2.14 | ppl 4.41 | wps 581101 | ups 1.17 | wpb 495133 | bsz 16502.9 | num_updates 16862 | lr 0.000487052 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1409 | gb_free 63.2 | wall 14496 epoch 010 | loss 3.678 | nll_loss 2.14 | ppl 4.41 | wps 581101 | ups 1.17 | wpb 495133 | bsz 16502.9 | num_updates 16862 | lr 0.000487052 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1409 | gb_free 63.2 | wall 14496 epoch 010 | loss 3.678 | nll_loss 2.14 | ppl 4.41 | wps 581101 | ups 1.17 | wpb 495133 | bsz 16502.9 | num_updates 16862 | lr 0.000487052 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1409 | gb_free 63.2 | wall 14496 epoch 010 | loss 3.678 | nll_loss 2.14 | ppl 4.41 | wps 581101 | ups 1.17 | wpb 495133 | bsz 16502.9 | num_updates 16862 | lr 0.000487052 | gnorm 0.161 | clip 0 | loss_scale 2 | train_wall 1409 | gb_free 63.2 | wall 14496 Start iterating over samples epoch 011: 38 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=582045, ups=1.19, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.161, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=14529 epoch 011: 38 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=582045, ups=1.19, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.161, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=14529 epoch 011: 38 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=582045, ups=1.19, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.161, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=14529 epoch 011: 38 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=582045, ups=1.19, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.161, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=14529 epoch 011: 38 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=582045, ups=1.19, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.161, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=14529 epoch 011: 38 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=582045, ups=1.19, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.161, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=14529 epoch 011: 38 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=582045, ups=1.19, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.161, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=14529 epoch 011: 38 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=582045, ups=1.19, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.161, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=14529 epoch 011: 38 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=582045, ups=1.19, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.161, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=14529 epoch 011: 38 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=582045, ups=1.19, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.161, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=14529 epoch 011: 38 / 1689 loss=3.664, nll_loss=2.125, ppl=4.36, wps=582045, ups=1.19, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.161, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=14529 epoch 011: 138 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=587700, ups=1.18, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=14613 epoch 011: 138 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=587700, ups=1.18, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=14613 epoch 011: 138 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=587700, ups=1.18, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=14613 epoch 011: 138 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=587700, ups=1.18, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=14613 epoch 011: 138 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=587700, ups=1.18, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=14613 epoch 011: 138 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=587700, ups=1.18, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=14613 epoch 011: 138 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=587700, ups=1.18, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=14613 epoch 011: 138 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=587700, ups=1.18, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=14613 epoch 011: 138 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=587700, ups=1.18, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=14613 epoch 011: 138 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=587700, ups=1.18, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=14613 epoch 011: 138 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=587700, ups=1.18, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=14613 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.76 epoch 011 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.76 epoch 011 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.76 epoch 011 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.76 epoch 011 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.76 epoch 011 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.76 epoch 011 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.76 epoch 011 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.76 epoch 011 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.76 epoch 011 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.76 epoch 011 | valid on 'valid' subset | loss 3.76 | nll_loss 2.214 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.76 epoch 011: 238 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=520153, ups=1.05, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.16, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=14708 epoch 011: 238 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=520153, ups=1.05, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.16, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=14708 epoch 011: 238 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=520153, ups=1.05, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.16, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=14708 epoch 011: 238 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=520153, ups=1.05, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.16, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=14708 epoch 011: 238 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=520153, ups=1.05, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.16, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=14708 epoch 011: 238 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=520153, ups=1.05, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.16, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=14708 epoch 011: 238 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=520153, ups=1.05, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.16, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=14708 epoch 011: 238 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=520153, ups=1.05, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.16, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=14708 epoch 011: 238 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=520153, ups=1.05, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.16, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=14708 epoch 011: 238 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=520153, ups=1.05, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.16, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=14708 epoch 011: 238 / 1689 loss=3.647, nll_loss=2.106, ppl=4.3, wps=520153, ups=1.05, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.16, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=14708 epoch 011: 338 / 1689 loss=3.657, nll_loss=2.117, ppl=4.34, wps=585875, ups=1.18, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=14793 epoch 011: 338 / 1689 loss=3.657, nll_loss=2.117, ppl=4.34, wps=585875, ups=1.18, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=14793 epoch 011: 338 / 1689 loss=3.657, nll_loss=2.117, ppl=4.34, wps=585875, ups=1.18, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=14793 epoch 011: 338 / 1689 loss=3.657, nll_loss=2.117, ppl=4.34, wps=585875, ups=1.18, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=14793 epoch 011: 338 / 1689 loss=3.657, nll_loss=2.117, ppl=4.34, wps=585875, ups=1.18, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=14793 epoch 011: 338 / 1689 loss=3.657, nll_loss=2.117, ppl=4.34, wps=585875, ups=1.18, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=14793 epoch 011: 338 / 1689 loss=3.657, nll_loss=2.117, ppl=4.34, wps=585875, ups=1.18, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=14793 epoch 011: 338 / 1689 loss=3.657, nll_loss=2.117, ppl=4.34, wps=585875, ups=1.18, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=14793 epoch 011: 338 / 1689 loss=3.657, nll_loss=2.117, ppl=4.34, wps=585875, ups=1.18, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=14793 epoch 011: 338 / 1689 loss=3.657, nll_loss=2.117, ppl=4.34, wps=585875, ups=1.18, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=14793 epoch 011: 338 / 1689 loss=3.657, nll_loss=2.117, ppl=4.34, wps=585875, ups=1.18, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=14793 epoch 011: 438 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=585965, ups=1.19, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=14877 epoch 011: 438 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=585965, ups=1.19, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=14877 epoch 011: 438 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=585965, ups=1.19, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=14877 epoch 011: 438 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=585965, ups=1.19, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=14877 epoch 011: 438 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=585965, ups=1.19, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=14877 epoch 011: 438 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=585965, ups=1.19, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=14877 epoch 011: 438 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=585965, ups=1.19, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=14877 epoch 011: 438 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=585965, ups=1.19, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=14877 epoch 011: 438 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=585965, ups=1.19, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=14877 epoch 011: 438 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=585965, ups=1.19, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=14877 epoch 011: 438 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=585965, ups=1.19, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=14877 epoch 011: 538 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=583368, ups=1.18, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=14962 epoch 011: 538 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=583368, ups=1.18, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=14962 epoch 011: 538 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=583368, ups=1.18, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=14962 epoch 011: 538 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=583368, ups=1.18, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=14962 epoch 011: 538 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=583368, ups=1.18, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=14962 epoch 011: 538 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=583368, ups=1.18, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=14962 epoch 011: 538 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=583368, ups=1.18, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=14962 epoch 011: 538 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=583368, ups=1.18, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=14962 epoch 011: 538 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=583368, ups=1.18, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=14962 epoch 011: 538 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=583368, ups=1.18, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=14962 epoch 011: 538 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=583368, ups=1.18, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=14962 epoch 011: 639 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=584341, ups=1.18, wpb=496719, bsz=16222.2, num_updates=17500, lr=0.000478091, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=59.5, wall=15047 epoch 011: 639 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=584341, ups=1.18, wpb=496719, bsz=16222.2, num_updates=17500, lr=0.000478091, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=59.5, wall=15047 epoch 011: 639 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=584341, ups=1.18, wpb=496719, bsz=16222.2, num_updates=17500, lr=0.000478091, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=59.5, wall=15047 epoch 011: 639 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=584341, ups=1.18, wpb=496719, bsz=16222.2, num_updates=17500, lr=0.000478091, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=59.5, wall=15047 epoch 011: 639 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=584341, ups=1.18, wpb=496719, bsz=16222.2, num_updates=17500, lr=0.000478091, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=59.5, wall=15047 epoch 011: 639 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=584341, ups=1.18, wpb=496719, bsz=16222.2, num_updates=17500, lr=0.000478091, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=59.5, wall=15047 epoch 011: 639 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=584341, ups=1.18, wpb=496719, bsz=16222.2, num_updates=17500, lr=0.000478091, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=59.5, wall=15047 epoch 011: 639 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=584341, ups=1.18, wpb=496719, bsz=16222.2, num_updates=17500, lr=0.000478091, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=59.5, wall=15047 epoch 011: 639 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=584341, ups=1.18, wpb=496719, bsz=16222.2, num_updates=17500, lr=0.000478091, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=59.5, wall=15047 epoch 011: 639 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=584341, ups=1.18, wpb=496719, bsz=16222.2, num_updates=17500, lr=0.000478091, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=59.5, wall=15047 epoch 011: 639 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=584341, ups=1.18, wpb=496719, bsz=16222.2, num_updates=17500, lr=0.000478091, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=59.5, wall=15047 epoch 011: 739 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=583838, ups=1.18, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=15132 epoch 011: 739 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=583838, ups=1.18, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=15132 epoch 011: 739 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=583838, ups=1.18, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=15132 epoch 011: 739 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=583838, ups=1.18, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=15132 epoch 011: 739 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=583838, ups=1.18, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=15132 epoch 011: 739 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=583838, ups=1.18, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=15132 epoch 011: 739 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=583838, ups=1.18, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=15132 epoch 011: 739 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=583838, ups=1.18, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=15132 epoch 011: 739 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=583838, ups=1.18, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=15132 epoch 011: 739 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=583838, ups=1.18, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=15132 epoch 011: 739 / 1689 loss=3.662, nll_loss=2.123, ppl=4.36, wps=583838, ups=1.18, wpb=494728, bsz=16490.2, num_updates=17600, lr=0.000476731, gnorm=0.161, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=15132 epoch 011: 839 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=584896, ups=1.18, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.157, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=15216 epoch 011: 839 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=584896, ups=1.18, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.157, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=15216 epoch 011: 839 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=584896, ups=1.18, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.157, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=15216 epoch 011: 839 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=584896, ups=1.18, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.157, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=15216 epoch 011: 839 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=584896, ups=1.18, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.157, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=15216 epoch 011: 839 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=584896, ups=1.18, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.157, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=15216 epoch 011: 839 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=584896, ups=1.18, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.157, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=15216 epoch 011: 839 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=584896, ups=1.18, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.157, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=15216 epoch 011: 839 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=584896, ups=1.18, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.157, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=15216 epoch 011: 839 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=584896, ups=1.18, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.157, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=15216 epoch 011: 839 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=584896, ups=1.18, wpb=495642, bsz=16615, num_updates=17700, lr=0.000475383, gnorm=0.157, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=15216 epoch 011: 939 / 1689 loss=3.656, nll_loss=2.116, ppl=4.34, wps=585157, ups=1.18, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.151, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=15301 epoch 011: 939 / 1689 loss=3.656, nll_loss=2.116, ppl=4.34, wps=585157, ups=1.18, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.151, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=15301 epoch 011: 939 / 1689 loss=3.656, nll_loss=2.116, ppl=4.34, wps=585157, ups=1.18, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.151, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=15301 epoch 011: 939 / 1689 loss=3.656, nll_loss=2.116, ppl=4.34, wps=585157, ups=1.18, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.151, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=15301 epoch 011: 939 / 1689 loss=3.656, nll_loss=2.116, ppl=4.34, wps=585157, ups=1.18, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.151, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=15301 epoch 011: 939 / 1689 loss=3.656, nll_loss=2.116, ppl=4.34, wps=585157, ups=1.18, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.151, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=15301 epoch 011: 939 / 1689 loss=3.656, nll_loss=2.116, ppl=4.34, wps=585157, ups=1.18, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.151, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=15301 epoch 011: 939 / 1689 loss=3.656, nll_loss=2.116, ppl=4.34, wps=585157, ups=1.18, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.151, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=15301 epoch 011: 939 / 1689 loss=3.656, nll_loss=2.116, ppl=4.34, wps=585157, ups=1.18, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.151, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=15301 epoch 011: 939 / 1689 loss=3.656, nll_loss=2.116, ppl=4.34, wps=585157, ups=1.18, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.151, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=15301 epoch 011: 939 / 1689 loss=3.656, nll_loss=2.116, ppl=4.34, wps=585157, ups=1.18, wpb=496919, bsz=16518.8, num_updates=17800, lr=0.000474045, gnorm=0.151, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=15301 epoch 011: 1039 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=584111, ups=1.18, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=15386 epoch 011: 1039 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=584111, ups=1.18, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=15386 epoch 011: 1039 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=584111, ups=1.18, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=15386 epoch 011: 1039 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=584111, ups=1.18, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=15386 epoch 011: 1039 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=584111, ups=1.18, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=15386 epoch 011: 1039 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=584111, ups=1.18, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=15386 epoch 011: 1039 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=584111, ups=1.18, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=15386 epoch 011: 1039 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=584111, ups=1.18, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=15386 epoch 011: 1039 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=584111, ups=1.18, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=15386 epoch 011: 1039 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=584111, ups=1.18, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=15386 epoch 011: 1039 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=584111, ups=1.18, wpb=493359, bsz=16746.8, num_updates=17900, lr=0.000472719, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=15386 epoch 011: 1139 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=584152, ups=1.18, wpb=496117, bsz=16615.5, num_updates=18000, lr=0.000471405, gnorm=0.166, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=15471 epoch 011: 1139 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=584152, ups=1.18, wpb=496117, bsz=16615.5, num_updates=18000, lr=0.000471405, gnorm=0.166, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=15471 epoch 011: 1139 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=584152, ups=1.18, wpb=496117, bsz=16615.5, num_updates=18000, lr=0.000471405, gnorm=0.166, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=15471 epoch 011: 1139 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=584152, ups=1.18, wpb=496117, bsz=16615.5, num_updates=18000, lr=0.000471405, gnorm=0.166, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=15471 epoch 011: 1139 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=584152, ups=1.18, wpb=496117, bsz=16615.5, num_updates=18000, lr=0.000471405, gnorm=0.166, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=15471 epoch 011: 1139 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=584152, ups=1.18, wpb=496117, bsz=16615.5, num_updates=18000, lr=0.000471405, gnorm=0.166, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=15471 epoch 011: 1139 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=584152, ups=1.18, wpb=496117, bsz=16615.5, num_updates=18000, lr=0.000471405, gnorm=0.166, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=15471 epoch 011: 1139 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=584152, ups=1.18, wpb=496117, bsz=16615.5, num_updates=18000, lr=0.000471405, gnorm=0.166, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=15471 epoch 011: 1139 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=584152, ups=1.18, wpb=496117, bsz=16615.5, num_updates=18000, lr=0.000471405, gnorm=0.166, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=15471 epoch 011: 1139 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=584152, ups=1.18, wpb=496117, bsz=16615.5, num_updates=18000, lr=0.000471405, gnorm=0.166, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=15471 epoch 011: 1139 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=584152, ups=1.18, wpb=496117, bsz=16615.5, num_updates=18000, lr=0.000471405, gnorm=0.166, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=15471 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.202 | ppl 4.6 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.202 | ppl 4.6 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.202 | ppl 4.6 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.202 | ppl 4.6 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.202 | ppl 4.6 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.202 | ppl 4.6 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.202 | ppl 4.6 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.202 | ppl 4.6 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.202 | ppl 4.6 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.202 | ppl 4.6 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011 | valid on 'valid' subset | loss 3.754 | nll_loss 2.202 | ppl 4.6 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.754 epoch 011: 1239 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=520352, ups=1.05, wpb=495327, bsz=16758, num_updates=18100, lr=0.0004701, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=15566 epoch 011: 1239 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=520352, ups=1.05, wpb=495327, bsz=16758, num_updates=18100, lr=0.0004701, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=15566 epoch 011: 1239 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=520352, ups=1.05, wpb=495327, bsz=16758, num_updates=18100, lr=0.0004701, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=15566 epoch 011: 1239 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=520352, ups=1.05, wpb=495327, bsz=16758, num_updates=18100, lr=0.0004701, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=15566 epoch 011: 1239 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=520352, ups=1.05, wpb=495327, bsz=16758, num_updates=18100, lr=0.0004701, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=15566 epoch 011: 1239 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=520352, ups=1.05, wpb=495327, bsz=16758, num_updates=18100, lr=0.0004701, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=15566 epoch 011: 1239 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=520352, ups=1.05, wpb=495327, bsz=16758, num_updates=18100, lr=0.0004701, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=15566 epoch 011: 1239 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=520352, ups=1.05, wpb=495327, bsz=16758, num_updates=18100, lr=0.0004701, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=15566 epoch 011: 1239 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=520352, ups=1.05, wpb=495327, bsz=16758, num_updates=18100, lr=0.0004701, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=15566 epoch 011: 1239 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=520352, ups=1.05, wpb=495327, bsz=16758, num_updates=18100, lr=0.0004701, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=15566 epoch 011: 1239 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=520352, ups=1.05, wpb=495327, bsz=16758, num_updates=18100, lr=0.0004701, gnorm=0.158, clip=0, loss_scale=4, train_wall=83, gb_free=60.9, wall=15566 epoch 011: 1339 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=588936, ups=1.19, wpb=495911, bsz=16409.3, num_updates=18200, lr=0.000468807, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=15650 epoch 011: 1339 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=588936, ups=1.19, wpb=495911, bsz=16409.3, num_updates=18200, lr=0.000468807, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=15650 epoch 011: 1339 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=588936, ups=1.19, wpb=495911, bsz=16409.3, num_updates=18200, lr=0.000468807, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=15650 epoch 011: 1339 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=588936, ups=1.19, wpb=495911, bsz=16409.3, num_updates=18200, lr=0.000468807, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=15650 epoch 011: 1339 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=588936, ups=1.19, wpb=495911, bsz=16409.3, num_updates=18200, lr=0.000468807, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=15650 epoch 011: 1339 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=588936, ups=1.19, wpb=495911, bsz=16409.3, num_updates=18200, lr=0.000468807, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=15650 epoch 011: 1339 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=588936, ups=1.19, wpb=495911, bsz=16409.3, num_updates=18200, lr=0.000468807, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=15650 epoch 011: 1339 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=588936, ups=1.19, wpb=495911, bsz=16409.3, num_updates=18200, lr=0.000468807, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=15650 epoch 011: 1339 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=588936, ups=1.19, wpb=495911, bsz=16409.3, num_updates=18200, lr=0.000468807, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=15650 epoch 011: 1339 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=588936, ups=1.19, wpb=495911, bsz=16409.3, num_updates=18200, lr=0.000468807, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=15650 epoch 011: 1339 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=588936, ups=1.19, wpb=495911, bsz=16409.3, num_updates=18200, lr=0.000468807, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=15650 epoch 011: 1439 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=589642, ups=1.19, wpb=493715, bsz=16059.8, num_updates=18300, lr=0.000467525, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=59.7, wall=15734 epoch 011: 1439 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=589642, ups=1.19, wpb=493715, bsz=16059.8, num_updates=18300, lr=0.000467525, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=59.7, wall=15734 epoch 011: 1439 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=589642, ups=1.19, wpb=493715, bsz=16059.8, num_updates=18300, lr=0.000467525, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=59.7, wall=15734 epoch 011: 1439 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=589642, ups=1.19, wpb=493715, bsz=16059.8, num_updates=18300, lr=0.000467525, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=59.7, wall=15734 epoch 011: 1439 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=589642, ups=1.19, wpb=493715, bsz=16059.8, num_updates=18300, lr=0.000467525, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=59.7, wall=15734 epoch 011: 1439 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=589642, ups=1.19, wpb=493715, bsz=16059.8, num_updates=18300, lr=0.000467525, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=59.7, wall=15734 epoch 011: 1439 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=589642, ups=1.19, wpb=493715, bsz=16059.8, num_updates=18300, lr=0.000467525, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=59.7, wall=15734 epoch 011: 1439 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=589642, ups=1.19, wpb=493715, bsz=16059.8, num_updates=18300, lr=0.000467525, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=59.7, wall=15734 epoch 011: 1439 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=589642, ups=1.19, wpb=493715, bsz=16059.8, num_updates=18300, lr=0.000467525, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=59.7, wall=15734 epoch 011: 1439 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=589642, ups=1.19, wpb=493715, bsz=16059.8, num_updates=18300, lr=0.000467525, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=59.7, wall=15734 epoch 011: 1439 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=589642, ups=1.19, wpb=493715, bsz=16059.8, num_updates=18300, lr=0.000467525, gnorm=0.156, clip=0, loss_scale=4, train_wall=83, gb_free=59.7, wall=15734 epoch 011: 1539 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=588449, ups=1.19, wpb=495016, bsz=16101.7, num_updates=18400, lr=0.000466252, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=15818 epoch 011: 1539 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=588449, ups=1.19, wpb=495016, bsz=16101.7, num_updates=18400, lr=0.000466252, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=15818 epoch 011: 1539 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=588449, ups=1.19, wpb=495016, bsz=16101.7, num_updates=18400, lr=0.000466252, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=15818 epoch 011: 1539 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=588449, ups=1.19, wpb=495016, bsz=16101.7, num_updates=18400, lr=0.000466252, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=15818 epoch 011: 1539 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=588449, ups=1.19, wpb=495016, bsz=16101.7, num_updates=18400, lr=0.000466252, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=15818 epoch 011: 1539 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=588449, ups=1.19, wpb=495016, bsz=16101.7, num_updates=18400, lr=0.000466252, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=15818 epoch 011: 1539 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=588449, ups=1.19, wpb=495016, bsz=16101.7, num_updates=18400, lr=0.000466252, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=15818 epoch 011: 1539 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=588449, ups=1.19, wpb=495016, bsz=16101.7, num_updates=18400, lr=0.000466252, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=15818 epoch 011: 1539 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=588449, ups=1.19, wpb=495016, bsz=16101.7, num_updates=18400, lr=0.000466252, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=15818 epoch 011: 1539 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=588449, ups=1.19, wpb=495016, bsz=16101.7, num_updates=18400, lr=0.000466252, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=15818 epoch 011: 1539 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=588449, ups=1.19, wpb=495016, bsz=16101.7, num_updates=18400, lr=0.000466252, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=15818 epoch 011: 1640 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=584442, ups=1.18, wpb=496666, bsz=16384.9, num_updates=18500, lr=0.000464991, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=15903 epoch 011: 1640 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=584442, ups=1.18, wpb=496666, bsz=16384.9, num_updates=18500, lr=0.000464991, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=15903 epoch 011: 1640 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=584442, ups=1.18, wpb=496666, bsz=16384.9, num_updates=18500, lr=0.000464991, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=15903 epoch 011: 1640 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=584442, ups=1.18, wpb=496666, bsz=16384.9, num_updates=18500, lr=0.000464991, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=15903 epoch 011: 1640 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=584442, ups=1.18, wpb=496666, bsz=16384.9, num_updates=18500, lr=0.000464991, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=15903 epoch 011: 1640 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=584442, ups=1.18, wpb=496666, bsz=16384.9, num_updates=18500, lr=0.000464991, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=15903 epoch 011: 1640 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=584442, ups=1.18, wpb=496666, bsz=16384.9, num_updates=18500, lr=0.000464991, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=15903 epoch 011: 1640 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=584442, ups=1.18, wpb=496666, bsz=16384.9, num_updates=18500, lr=0.000464991, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=15903 epoch 011: 1640 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=584442, ups=1.18, wpb=496666, bsz=16384.9, num_updates=18500, lr=0.000464991, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=15903 epoch 011: 1640 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=584442, ups=1.18, wpb=496666, bsz=16384.9, num_updates=18500, lr=0.000464991, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=15903 epoch 011: 1640 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=584442, ups=1.18, wpb=496666, bsz=16384.9, num_updates=18500, lr=0.000464991, gnorm=0.154, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=15903 end of epoch 11 (average epoch stats below) epoch 011 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 577086 | ups 1.17 | wpb 495115 | bsz 16506 | num_updates 18549 | lr 0.000464376 | gnorm 0.157 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.5 | wall 15943 epoch 011 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 577086 | ups 1.17 | wpb 495115 | bsz 16506 | num_updates 18549 | lr 0.000464376 | gnorm 0.157 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.5 | wall 15943 epoch 011 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 577086 | ups 1.17 | wpb 495115 | bsz 16506 | num_updates 18549 | lr 0.000464376 | gnorm 0.157 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.5 | wall 15943 epoch 011 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 577086 | ups 1.17 | wpb 495115 | bsz 16506 | num_updates 18549 | lr 0.000464376 | gnorm 0.157 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.5 | wall 15943 epoch 011 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 577086 | ups 1.17 | wpb 495115 | bsz 16506 | num_updates 18549 | lr 0.000464376 | gnorm 0.157 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.5 | wall 15943 epoch 011 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 577086 | ups 1.17 | wpb 495115 | bsz 16506 | num_updates 18549 | lr 0.000464376 | gnorm 0.157 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.5 | wall 15943 epoch 011 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 577086 | ups 1.17 | wpb 495115 | bsz 16506 | num_updates 18549 | lr 0.000464376 | gnorm 0.157 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.5 | wall 15943 epoch 011 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 577086 | ups 1.17 | wpb 495115 | bsz 16506 | num_updates 18549 | lr 0.000464376 | gnorm 0.157 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.5 | wall 15943 epoch 011 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 577086 | ups 1.17 | wpb 495115 | bsz 16506 | num_updates 18549 | lr 0.000464376 | gnorm 0.157 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.5 | wall 15943 epoch 011 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 577086 | ups 1.17 | wpb 495115 | bsz 16506 | num_updates 18549 | lr 0.000464376 | gnorm 0.157 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.5 | wall 15943 epoch 011 | loss 3.66 | nll_loss 2.121 | ppl 4.35 | wps 577086 | ups 1.17 | wpb 495115 | bsz 16506 | num_updates 18549 | lr 0.000464376 | gnorm 0.157 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.5 | wall 15943 Start iterating over samples epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 51 / 1689 loss=3.648, nll_loss=2.107, ppl=4.31, wps=582440, ups=1.19, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=15987 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 151 / 1689 loss=3.627, nll_loss=2.083, ppl=4.24, wps=587774, ups=1.18, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16072 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 251 / 1689 loss=3.638, nll_loss=2.096, ppl=4.28, wps=585462, ups=1.18, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=16156 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 351 / 1689 loss=3.64, nll_loss=2.099, ppl=4.28, wps=585446, ups=1.18, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.155, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=16241 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 epoch 012: 451 / 1689 loss=3.644, nll_loss=2.103, ppl=4.3, wps=587095, ups=1.19, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.156, clip=0, loss_scale=8, train_wall=83, gb_free=61.8, wall=16325 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012 | valid on 'valid' subset | loss 3.753 | nll_loss 2.204 | ppl 4.61 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.753 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 552 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=511473, ups=1.03, wpb=495837, bsz=16444.8, num_updates=19100, lr=0.000457629, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=16422 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 652 / 1689 loss=3.641, nll_loss=2.1, ppl=4.29, wps=590668, ups=1.19, wpb=496756, bsz=16709.6, num_updates=19200, lr=0.000456435, gnorm=0.15, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=16506 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 752 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=592092, ups=1.19, wpb=496193, bsz=16467.5, num_updates=19300, lr=0.000455251, gnorm=0.16, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=16590 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 852 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=585167, ups=1.18, wpb=495829, bsz=16773.1, num_updates=19400, lr=0.000454077, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=16674 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 952 / 1689 loss=3.651, nll_loss=2.111, ppl=4.32, wps=590960, ups=1.19, wpb=495158, bsz=16638.7, num_updates=19500, lr=0.000452911, gnorm=0.153, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=16758 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1052 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=588384, ups=1.19, wpb=495484, bsz=16063.3, num_updates=19600, lr=0.000451754, gnorm=0.15, clip=0, loss_scale=8, train_wall=83, gb_free=62.3, wall=16842 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1153 / 1689 loss=3.646, nll_loss=2.106, ppl=4.31, wps=580739, ups=1.17, wpb=495955, bsz=16393.2, num_updates=19700, lr=0.000450606, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=16928 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1254 / 1689 loss=3.649, nll_loss=2.109, ppl=4.32, wps=577141, ups=1.17, wpb=494514, bsz=16750.1, num_updates=19800, lr=0.000449467, gnorm=0.153, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=17014 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1354 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=589918, ups=1.19, wpb=495324, bsz=16862.5, num_updates=19900, lr=0.000448336, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=17097 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 epoch 012: 1454 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=585705, ups=1.18, wpb=494708, bsz=16322, num_updates=20000, lr=0.000447214, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=17182 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012 | valid on 'valid' subset | loss 3.731 | nll_loss 2.18 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.731 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1554 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=512324, ups=1.03, wpb=495145, bsz=16623, num_updates=20100, lr=0.0004461, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17279 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 epoch 012: 1654 / 1689 loss=3.651, nll_loss=2.112, ppl=4.32, wps=587838, ups=1.18, wpb=496221, bsz=16478, num_updates=20200, lr=0.000444994, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=17363 end of epoch 12 (average epoch stats below) epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 epoch 012 | loss 3.644 | nll_loss 2.104 | ppl 4.3 | wps 576324 | ups 1.16 | wpb 495124 | bsz 16502.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.152 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.8 | wall 17392 Start iterating over samples epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 65 / 1689 loss=3.625, nll_loss=2.082, ppl=4.23, wps=584315, ups=1.19, wpb=491838, bsz=16115.6, num_updates=20300, lr=0.000443897, gnorm=0.161, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=17447 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 165 / 1689 loss=3.617, nll_loss=2.073, ppl=4.21, wps=590031, ups=1.19, wpb=495102, bsz=16327.7, num_updates=20400, lr=0.000442807, gnorm=0.154, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=17531 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 265 / 1689 loss=3.619, nll_loss=2.075, ppl=4.21, wps=583745, ups=1.18, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=17616 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 365 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585024, ups=1.18, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=17700 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 465 / 1689 loss=3.634, nll_loss=2.092, ppl=4.26, wps=584025, ups=1.18, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.145, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=17785 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 565 / 1689 loss=3.63, nll_loss=2.088, ppl=4.25, wps=589260, ups=1.19, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=17869 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 666 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=580368, ups=1.17, wpb=496309, bsz=16995.7, num_updates=20900, lr=0.000437479, gnorm=0.15, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=17955 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 epoch 013: 766 / 1689 loss=3.636, nll_loss=2.095, ppl=4.27, wps=586023, ups=1.18, wpb=494990, bsz=16509.3, num_updates=21000, lr=0.000436436, gnorm=0.151, clip=0, loss_scale=4, train_wall=83, gb_free=62.4, wall=18039 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013 | valid on 'valid' subset | loss 3.74 | nll_loss 2.192 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.731 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 867 / 1689 loss=3.632, nll_loss=2.091, ppl=4.26, wps=526840, ups=1.06, wpb=495473, bsz=16764.5, num_updates=21100, lr=0.0004354, gnorm=0.15, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=18133 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 967 / 1689 loss=3.627, nll_loss=2.084, ppl=4.24, wps=586282, ups=1.18, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=18218 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1067 / 1689 loss=3.635, nll_loss=2.093, ppl=4.27, wps=587632, ups=1.19, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.155, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=18302 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1167 / 1689 loss=3.641, nll_loss=2.101, ppl=4.29, wps=587862, ups=1.19, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.154, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=18386 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1267 / 1689 loss=3.639, nll_loss=2.098, ppl=4.28, wps=588084, ups=1.19, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.152, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=18470 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1367 / 1689 loss=3.635, nll_loss=2.094, ppl=4.27, wps=584144, ups=1.18, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.144, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=18555 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1467 / 1689 loss=3.628, nll_loss=2.087, ppl=4.25, wps=587514, ups=1.18, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=18640 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1567 / 1689 loss=3.643, nll_loss=2.103, ppl=4.3, wps=583709, ups=1.18, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.151, clip=0, loss_scale=4, train_wall=84, gb_free=62, wall=18724 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 epoch 013: 1667 / 1689 loss=3.636, nll_loss=2.096, ppl=4.27, wps=586842, ups=1.18, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=18809 end of epoch 13 (average epoch stats below) epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 epoch 013 | loss 3.631 | nll_loss 2.089 | ppl 4.26 | wps 582067 | ups 1.18 | wpb 495108 | bsz 16507.3 | num_updates 21922 | lr 0.000427159 | gnorm 0.149 | clip 0 | loss_scale 4 | train_wall 1406 | gb_free 62.7 | wall 18827 Start iterating over samples epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 epoch 014: 78 / 1689 loss=3.611, nll_loss=2.066, ppl=4.19, wps=581354, ups=1.18, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.149, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=18894 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.724 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.724 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 178 / 1689 loss=3.601, nll_loss=2.055, ppl=4.16, wps=520725, ups=1.05, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.144, clip=0, loss_scale=8, train_wall=83, gb_free=61, wall=18989 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 279 / 1689 loss=3.608, nll_loss=2.063, ppl=4.18, wps=581979, ups=1.17, wpb=496818, bsz=16362.1, num_updates=22200, lr=0.000424476, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=19074 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 380 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=582451, ups=1.18, wpb=495620, bsz=16703.8, num_updates=22300, lr=0.000423524, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=19160 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 480 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=582513, ups=1.18, wpb=495189, bsz=16525.2, num_updates=22400, lr=0.000422577, gnorm=0.149, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=19245 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 583 / 1689 loss=3.618, nll_loss=2.075, ppl=4.21, wps=568670, ups=1.15, wpb=495122, bsz=16542.5, num_updates=22500, lr=0.000421637, gnorm=0.15, clip=0, loss_scale=0.25, train_wall=86, gb_free=61.7, wall=19332 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 683 / 1689 loss=3.62, nll_loss=2.077, ppl=4.22, wps=585536, ups=1.19, wpb=493944, bsz=16365, num_updates=22600, lr=0.000420703, gnorm=0.141, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.9, wall=19416 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 783 / 1689 loss=3.616, nll_loss=2.073, ppl=4.21, wps=586046, ups=1.18, wpb=495184, bsz=16445.5, num_updates=22700, lr=0.000419775, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.2, wall=19500 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 883 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=586033, ups=1.18, wpb=495503, bsz=16654, num_updates=22800, lr=0.000418854, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.5, wall=19585 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 983 / 1689 loss=3.624, nll_loss=2.082, ppl=4.23, wps=585645, ups=1.18, wpb=494576, bsz=16726.6, num_updates=22900, lr=0.000417938, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=19669 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 epoch 014: 1083 / 1689 loss=3.621, nll_loss=2.078, ppl=4.22, wps=582995, ups=1.18, wpb=494597, bsz=16544.8, num_updates=23000, lr=0.000417029, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.3, wall=19754 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014 | valid on 'valid' subset | loss 3.739 | nll_loss 2.189 | ppl 4.56 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.724 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1183 / 1689 loss=3.627, nll_loss=2.085, ppl=4.24, wps=535510, ups=1.08, wpb=493926, bsz=16546.8, num_updates=23100, lr=0.000416125, gnorm=0.149, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.5, wall=19847 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1283 / 1689 loss=3.625, nll_loss=2.083, ppl=4.24, wps=590384, ups=1.19, wpb=496190, bsz=16168.4, num_updates=23200, lr=0.000415227, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.6, wall=19931 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1383 / 1689 loss=3.627, nll_loss=2.086, ppl=4.25, wps=587860, ups=1.19, wpb=495297, bsz=16368.2, num_updates=23300, lr=0.000414335, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.8, wall=20015 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1483 / 1689 loss=3.626, nll_loss=2.085, ppl=4.24, wps=581520, ups=1.17, wpb=495221, bsz=16655.7, num_updates=23400, lr=0.000413449, gnorm=0.138, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.5, wall=20100 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1583 / 1689 loss=3.616, nll_loss=2.074, ppl=4.21, wps=586385, ups=1.18, wpb=496959, bsz=16741.5, num_updates=23500, lr=0.000412568, gnorm=0.149, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=20185 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 epoch 014: 1683 / 1689 loss=3.634, nll_loss=2.093, ppl=4.27, wps=585897, ups=1.19, wpb=493258, bsz=16426.8, num_updates=23600, lr=0.000411693, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=20269 end of epoch 14 (average epoch stats below) epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 epoch 014 | loss 3.619 | nll_loss 2.076 | ppl 4.22 | wps 576414 | ups 1.16 | wpb 495107 | bsz 16510.7 | num_updates 23606 | lr 0.000411641 | gnorm 0.146 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 63.5 | wall 20273 Start iterating over samples epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 94 / 1689 loss=3.591, nll_loss=2.044, ppl=4.12, wps=581941, ups=1.19, wpb=490263, bsz=16263.5, num_updates=23700, lr=0.000410824, gnorm=0.154, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=20353 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 194 / 1689 loss=3.598, nll_loss=2.052, ppl=4.15, wps=586226, ups=1.18, wpb=494874, bsz=16427, num_updates=23800, lr=0.00040996, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=20438 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 294 / 1689 loss=3.597, nll_loss=2.051, ppl=4.14, wps=584764, ups=1.18, wpb=495334, bsz=16510.9, num_updates=23900, lr=0.000409101, gnorm=0.144, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=20522 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 epoch 015: 394 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=587023, ups=1.18, wpb=496557, bsz=16860.5, num_updates=24000, lr=0.000408248, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20607 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.713 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.713 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 494 / 1689 loss=3.602, nll_loss=2.057, ppl=4.16, wps=517238, ups=1.04, wpb=495610, bsz=16735.4, num_updates=24100, lr=0.0004074, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=20703 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 594 / 1689 loss=3.606, nll_loss=2.062, ppl=4.18, wps=586333, ups=1.19, wpb=494237, bsz=16334.3, num_updates=24200, lr=0.000406558, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=20787 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 694 / 1689 loss=3.606, nll_loss=2.061, ppl=4.17, wps=583699, ups=1.18, wpb=495018, bsz=16705.3, num_updates=24300, lr=0.00040572, gnorm=0.146, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=20872 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 794 / 1689 loss=3.609, nll_loss=2.065, ppl=4.18, wps=586532, ups=1.18, wpb=495342, bsz=16290.8, num_updates=24400, lr=0.000404888, gnorm=0.148, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=20956 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 895 / 1689 loss=3.613, nll_loss=2.069, ppl=4.2, wps=581513, ups=1.18, wpb=494415, bsz=16245.9, num_updates=24500, lr=0.000404061, gnorm=0.169, clip=0, loss_scale=1, train_wall=84, gb_free=61.5, wall=21041 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 996 / 1689 loss=3.617, nll_loss=2.074, ppl=4.21, wps=585943, ups=1.18, wpb=497504, bsz=16210.4, num_updates=24600, lr=0.000403239, gnorm=0.15, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.8, wall=21126 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1096 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=582939, ups=1.18, wpb=495229, bsz=16775.7, num_updates=24700, lr=0.000402422, gnorm=0.146, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.5, wall=21211 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1196 / 1689 loss=3.622, nll_loss=2.08, ppl=4.23, wps=583204, ups=1.18, wpb=493693, bsz=16256, num_updates=24800, lr=0.00040161, gnorm=0.152, clip=0, loss_scale=0.5, train_wall=84, gb_free=60.1, wall=21296 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1297 / 1689 loss=3.608, nll_loss=2.065, ppl=4.18, wps=579733, ups=1.17, wpb=495170, bsz=17058.5, num_updates=24900, lr=0.000400802, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.9, wall=21381 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 epoch 015: 1397 / 1689 loss=3.61, nll_loss=2.067, ppl=4.19, wps=588973, ups=1.18, wpb=497188, bsz=16455.7, num_updates=25000, lr=0.0004, gnorm=0.148, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.3, wall=21466 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015 | valid on 'valid' subset | loss 3.725 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.713 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1497 / 1689 loss=3.614, nll_loss=2.071, ppl=4.2, wps=537960, ups=1.09, wpb=494834, bsz=16903.3, num_updates=25100, lr=0.000399202, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.4, wall=21558 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 epoch 015: 1597 / 1689 loss=3.62, nll_loss=2.078, ppl=4.22, wps=587558, ups=1.18, wpb=496062, bsz=16531.8, num_updates=25200, lr=0.00039841, gnorm=0.149, clip=0, loss_scale=0.25, train_wall=83, gb_free=62.2, wall=21642 end of epoch 15 (average epoch stats below) epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 epoch 015 | loss 3.608 | nll_loss 2.064 | ppl 4.18 | wps 577555 | ups 1.17 | wpb 495109 | bsz 16508.3 | num_updates 25292 | lr 0.000397684 | gnorm 0.147 | clip 0 | loss_scale 0.25 | train_wall 1408 | gb_free 63.9 | wall 21719 Start iterating over samples epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 8 / 1689 loss=3.618, nll_loss=2.076, ppl=4.22, wps=586411, ups=1.19, wpb=491808, bsz=15985.2, num_updates=25300, lr=0.000397621, gnorm=0.14, clip=0, loss_scale=0.25, train_wall=82, gb_free=61.1, wall=21726 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 108 / 1689 loss=3.587, nll_loss=2.04, ppl=4.11, wps=588577, ups=1.19, wpb=494833, bsz=16304.2, num_updates=25400, lr=0.000396838, gnorm=0.143, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.9, wall=21810 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 208 / 1689 loss=3.584, nll_loss=2.036, ppl=4.1, wps=583949, ups=1.18, wpb=496652, bsz=16667.5, num_updates=25500, lr=0.000396059, gnorm=0.139, clip=0, loss_scale=0.5, train_wall=84, gb_free=62, wall=21895 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 308 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=584123, ups=1.18, wpb=493569, bsz=16242.7, num_updates=25600, lr=0.000395285, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=83, gb_free=60.5, wall=21980 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 409 / 1689 loss=3.594, nll_loss=2.048, ppl=4.13, wps=583382, ups=1.18, wpb=494808, bsz=16141.7, num_updates=25700, lr=0.000394515, gnorm=0.152, clip=0, loss_scale=0.25, train_wall=84, gb_free=61.5, wall=22064 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 509 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=587696, ups=1.18, wpb=496467, bsz=16181.4, num_updates=25800, lr=0.00039375, gnorm=0.144, clip=0, loss_scale=0.25, train_wall=83, gb_free=60.3, wall=22149 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 609 / 1689 loss=3.594, nll_loss=2.048, ppl=4.14, wps=585948, ups=1.18, wpb=494702, bsz=16692.3, num_updates=25900, lr=0.000392989, gnorm=0.138, clip=0, loss_scale=0.25, train_wall=83, gb_free=61.6, wall=22233 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 epoch 016: 709 / 1689 loss=3.598, nll_loss=2.053, ppl=4.15, wps=582830, ups=1.18, wpb=494018, bsz=16608.4, num_updates=26000, lr=0.000392232, gnorm=0.142, clip=0, loss_scale=0.25, train_wall=84, gb_free=62.4, wall=22318 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016 | valid on 'valid' subset | loss 3.713 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.713 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 809 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=516768, ups=1.05, wpb=494367, bsz=16635.3, num_updates=26100, lr=0.00039148, gnorm=0.145, clip=0, loss_scale=0.25, train_wall=83, gb_free=61, wall=22414 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 909 / 1689 loss=3.601, nll_loss=2.056, ppl=4.16, wps=585602, ups=1.18, wpb=495538, bsz=16659.4, num_updates=26200, lr=0.000390732, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61, wall=22498 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1009 / 1689 loss=3.599, nll_loss=2.054, ppl=4.15, wps=589915, ups=1.19, wpb=495738, bsz=16328.2, num_updates=26300, lr=0.000389989, gnorm=0.135, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.4, wall=22582 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1109 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=589903, ups=1.19, wpb=496389, bsz=16562.2, num_updates=26400, lr=0.000389249, gnorm=0.14, clip=0, loss_scale=0.5, train_wall=83, gb_free=61.3, wall=22667 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1209 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584198, ups=1.18, wpb=495018, bsz=17100.4, num_updates=26500, lr=0.000388514, gnorm=0.141, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.9, wall=22751 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1309 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=586258, ups=1.18, wpb=496414, bsz=16728.9, num_updates=26600, lr=0.000387783, gnorm=0.142, clip=0, loss_scale=0.5, train_wall=84, gb_free=61.8, wall=22836 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1409 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586528, ups=1.18, wpb=495239, bsz=16378.2, num_updates=26700, lr=0.000387056, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=22920 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1509 / 1689 loss=3.604, nll_loss=2.061, ppl=4.17, wps=584589, ups=1.18, wpb=495249, bsz=16547.4, num_updates=26800, lr=0.000386334, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=23005 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 epoch 016: 1609 / 1689 loss=3.603, nll_loss=2.06, ppl=4.17, wps=586594, ups=1.18, wpb=496717, bsz=16258.9, num_updates=26900, lr=0.000385615, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=23090 end of epoch 16 (average epoch stats below) epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 epoch 016 | loss 3.598 | nll_loss 2.053 | ppl 4.15 | wps 581254 | ups 1.17 | wpb 495114 | bsz 16506.4 | num_updates 26980 | lr 0.000385043 | gnorm 0.142 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 62.9 | wall 23157 Start iterating over samples epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 epoch 017: 20 / 1689 loss=3.601, nll_loss=2.057, ppl=4.16, wps=582447, ups=1.19, wpb=490647, bsz=16599.9, num_updates=27000, lr=0.0003849, gnorm=0.144, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=23174 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017 | valid on 'valid' subset | loss 3.722 | nll_loss 2.171 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.713 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 120 / 1689 loss=3.579, nll_loss=2.031, ppl=4.09, wps=539235, ups=1.09, wpb=496106, bsz=16510.4, num_updates=27100, lr=0.000384189, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.1, wall=23266 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 220 / 1689 loss=3.583, nll_loss=2.035, ppl=4.1, wps=587211, ups=1.19, wpb=495214, bsz=16373.8, num_updates=27200, lr=0.000383482, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=23350 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 320 / 1689 loss=3.577, nll_loss=2.029, ppl=4.08, wps=582016, ups=1.18, wpb=494176, bsz=16493.8, num_updates=27300, lr=0.00038278, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=23435 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 420 / 1689 loss=3.58, nll_loss=2.033, ppl=4.09, wps=584772, ups=1.18, wpb=496057, bsz=16597.3, num_updates=27400, lr=0.00038208, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=23520 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 520 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=586035, ups=1.19, wpb=494298, bsz=16436.4, num_updates=27500, lr=0.000381385, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=23604 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 620 / 1689 loss=3.589, nll_loss=2.042, ppl=4.12, wps=582403, ups=1.18, wpb=493670, bsz=16199.9, num_updates=27600, lr=0.000380693, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=23689 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 720 / 1689 loss=3.589, nll_loss=2.043, ppl=4.12, wps=582784, ups=1.18, wpb=495338, bsz=16314.2, num_updates=27700, lr=0.000380006, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23774 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 820 / 1689 loss=3.592, nll_loss=2.047, ppl=4.13, wps=588872, ups=1.19, wpb=496681, bsz=17014.6, num_updates=27800, lr=0.000379322, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=23859 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 920 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=585988, ups=1.18, wpb=495638, bsz=16710.2, num_updates=27900, lr=0.000378641, gnorm=0.146, clip=0, loss_scale=4, train_wall=84, gb_free=61.5, wall=23943 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 epoch 017: 1020 / 1689 loss=3.595, nll_loss=2.05, ppl=4.14, wps=582341, ups=1.18, wpb=494540, bsz=16319, num_updates=28000, lr=0.000377964, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=24028 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017 | valid on 'valid' subset | loss 3.703 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.703 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1120 / 1689 loss=3.59, nll_loss=2.045, ppl=4.13, wps=524272, ups=1.06, wpb=496700, bsz=16741.9, num_updates=28100, lr=0.000377291, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=24123 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1220 / 1689 loss=3.59, nll_loss=2.044, ppl=4.12, wps=587635, ups=1.19, wpb=495031, bsz=16288.2, num_updates=28200, lr=0.000376622, gnorm=0.138, clip=0, loss_scale=8, train_wall=83, gb_free=62, wall=24207 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1321 / 1689 loss=3.593, nll_loss=2.047, ppl=4.13, wps=583503, ups=1.18, wpb=495254, bsz=16426.7, num_updates=28300, lr=0.000375956, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=60.7, wall=24292 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1422 / 1689 loss=3.591, nll_loss=2.046, ppl=4.13, wps=579385, ups=1.17, wpb=495616, bsz=16504.6, num_updates=28400, lr=0.000375293, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=62, wall=24377 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1522 / 1689 loss=3.593, nll_loss=2.048, ppl=4.13, wps=587371, ups=1.18, wpb=496825, bsz=16686.2, num_updates=28500, lr=0.000374634, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=24462 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 epoch 017: 1622 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=586108, ups=1.18, wpb=495648, bsz=16841, num_updates=28600, lr=0.000373979, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24547 end of epoch 17 (average epoch stats below) epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 epoch 017 | loss 3.589 | nll_loss 2.043 | ppl 4.12 | wps 577934 | ups 1.17 | wpb 495142 | bsz 16506.2 | num_updates 28667 | lr 0.000373542 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 60.5 | wall 24602 Start iterating over samples epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 33 / 1689 loss=3.586, nll_loss=2.04, ppl=4.11, wps=586988, ups=1.2, wpb=490271, bsz=16070.5, num_updates=28700, lr=0.000373327, gnorm=0.142, clip=0, loss_scale=2, train_wall=82, gb_free=61.3, wall=24630 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 133 / 1689 loss=3.566, nll_loss=2.016, ppl=4.04, wps=584489, ups=1.18, wpb=494848, bsz=16434.9, num_updates=28800, lr=0.000372678, gnorm=0.141, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=24715 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 233 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=589517, ups=1.19, wpb=496144, bsz=16327.8, num_updates=28900, lr=0.000372033, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=24799 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 epoch 018: 334 / 1689 loss=3.576, nll_loss=2.028, ppl=4.08, wps=582467, ups=1.18, wpb=495673, bsz=16582.3, num_updates=29000, lr=0.000371391, gnorm=0.142, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=24884 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.716 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.703 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 434 / 1689 loss=3.569, nll_loss=2.02, ppl=4.06, wps=527923, ups=1.06, wpb=498524, bsz=16640.2, num_updates=29100, lr=0.000370752, gnorm=0.145, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=24979 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 534 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=590324, ups=1.19, wpb=495565, bsz=16467.4, num_updates=29200, lr=0.000370117, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=25062 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 634 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=588310, ups=1.19, wpb=495851, bsz=16316.9, num_updates=29300, lr=0.000369484, gnorm=0.146, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=25147 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 735 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=582362, ups=1.17, wpb=495827, bsz=16545.4, num_updates=29400, lr=0.000368856, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25232 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 835 / 1689 loss=3.581, nll_loss=2.035, ppl=4.1, wps=588122, ups=1.19, wpb=494889, bsz=16930.9, num_updates=29500, lr=0.00036823, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=25316 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 935 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=586255, ups=1.18, wpb=496166, bsz=16399.5, num_updates=29600, lr=0.000367607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.3, wall=25401 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1035 / 1689 loss=3.585, nll_loss=2.039, ppl=4.11, wps=581243, ups=1.18, wpb=494349, bsz=16325.8, num_updates=29700, lr=0.000366988, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=25486 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1135 / 1689 loss=3.581, nll_loss=2.034, ppl=4.1, wps=586507, ups=1.18, wpb=495762, bsz=16620, num_updates=29800, lr=0.000366372, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=25570 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1235 / 1689 loss=3.582, nll_loss=2.035, ppl=4.1, wps=588380, ups=1.19, wpb=495413, bsz=16652.6, num_updates=29900, lr=0.000365758, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=25654 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 epoch 018: 1335 / 1689 loss=3.582, nll_loss=2.036, ppl=4.1, wps=588515, ups=1.19, wpb=495093, bsz=16299.9, num_updates=30000, lr=0.000365148, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=25739 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018 | valid on 'valid' subset | loss 3.704 | nll_loss 2.152 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.703 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1435 / 1689 loss=3.59, nll_loss=2.044, ppl=4.13, wps=529300, ups=1.07, wpb=494949, bsz=16354.8, num_updates=30100, lr=0.000364541, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=25832 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1535 / 1689 loss=3.591, nll_loss=2.045, ppl=4.13, wps=589153, ups=1.19, wpb=495286, bsz=16332.2, num_updates=30200, lr=0.000363937, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=25916 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 epoch 018: 1635 / 1689 loss=3.6, nll_loss=2.056, ppl=4.16, wps=585838, ups=1.19, wpb=492063, bsz=16839.8, num_updates=30300, lr=0.000363336, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=26000 end of epoch 18 (average epoch stats below) epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 epoch 018 | loss 3.581 | nll_loss 2.034 | ppl 4.1 | wps 578928 | ups 1.17 | wpb 495124 | bsz 16506 | num_updates 30354 | lr 0.000363013 | gnorm 0.14 | clip 0 | loss_scale 2 | train_wall 1406 | gb_free 62.4 | wall 26045 Start iterating over samples epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 46 / 1689 loss=3.578, nll_loss=2.03, ppl=4.08, wps=586963, ups=1.19, wpb=492139, bsz=16545.6, num_updates=30400, lr=0.000362738, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=59.9, wall=26084 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 146 / 1689 loss=3.556, nll_loss=2.006, ppl=4.02, wps=587425, ups=1.18, wpb=496570, bsz=16890.3, num_updates=30500, lr=0.000362143, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26169 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 247 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=582969, ups=1.17, wpb=496376, bsz=16555, num_updates=30600, lr=0.000361551, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=26254 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 347 / 1689 loss=3.564, nll_loss=2.015, ppl=4.04, wps=586221, ups=1.18, wpb=495984, bsz=16641.7, num_updates=30700, lr=0.000360961, gnorm=0.129, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=26338 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 447 / 1689 loss=3.567, nll_loss=2.018, ppl=4.05, wps=587167, ups=1.18, wpb=495854, bsz=16107.8, num_updates=30800, lr=0.000360375, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=26423 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 547 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=592617, ups=1.19, wpb=496786, bsz=16595.1, num_updates=30900, lr=0.000359791, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=62.2, wall=26507 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 epoch 019: 647 / 1689 loss=3.575, nll_loss=2.027, ppl=4.08, wps=587040, ups=1.18, wpb=495588, bsz=16849.4, num_updates=31000, lr=0.000359211, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=26591 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.701 | nll_loss 2.153 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.701 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 747 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=516180, ups=1.04, wpb=495055, bsz=16695.2, num_updates=31100, lr=0.000358633, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=26687 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 847 / 1689 loss=3.573, nll_loss=2.026, ppl=4.07, wps=585577, ups=1.18, wpb=494675, bsz=16415.2, num_updates=31200, lr=0.000358057, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=26771 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 947 / 1689 loss=3.574, nll_loss=2.027, ppl=4.07, wps=589131, ups=1.19, wpb=496419, bsz=16263.6, num_updates=31300, lr=0.000357485, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=26856 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1047 / 1689 loss=3.577, nll_loss=2.03, ppl=4.08, wps=584876, ups=1.18, wpb=494092, bsz=16317.7, num_updates=31400, lr=0.000356915, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=26940 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1147 / 1689 loss=3.583, nll_loss=2.037, ppl=4.1, wps=585167, ups=1.19, wpb=493687, bsz=16411, num_updates=31500, lr=0.000356348, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=27025 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1248 / 1689 loss=3.586, nll_loss=2.041, ppl=4.11, wps=579870, ups=1.17, wpb=494087, bsz=16831.4, num_updates=31600, lr=0.000355784, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=27110 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1348 / 1689 loss=3.577, nll_loss=2.031, ppl=4.09, wps=584957, ups=1.18, wpb=495488, bsz=16721.5, num_updates=31700, lr=0.000355222, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=27194 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1448 / 1689 loss=3.579, nll_loss=2.032, ppl=4.09, wps=586274, ups=1.18, wpb=494846, bsz=16221.5, num_updates=31800, lr=0.000354663, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27279 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1548 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586015, ups=1.18, wpb=495491, bsz=16270.8, num_updates=31900, lr=0.000354107, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27363 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 epoch 019: 1648 / 1689 loss=3.583, nll_loss=2.037, ppl=4.11, wps=591515, ups=1.19, wpb=495281, bsz=16554.2, num_updates=32000, lr=0.000353553, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=27447 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 epoch 019 | valid on 'valid' subset | loss 3.705 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.701 end of epoch 19 (average epoch stats below) epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 epoch 019 | loss 3.573 | nll_loss 2.026 | ppl 4.07 | wps 578330 | ups 1.17 | wpb 495115 | bsz 16507.7 | num_updates 32041 | lr 0.000353327 | gnorm 0.139 | clip 0 | loss_scale 4 | train_wall 1405 | gb_free 62.9 | wall 27489 Start iterating over samples epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 60 / 1689 loss=3.555, nll_loss=2.004, ppl=4.01, wps=527645, ups=1.08, wpb=490756, bsz=16253.8, num_updates=32100, lr=0.000353002, gnorm=0.145, clip=0, loss_scale=4, train_wall=84, gb_free=60.4, wall=27540 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 160 / 1689 loss=3.553, nll_loss=2.002, ppl=4.01, wps=587476, ups=1.18, wpb=495887, bsz=16160.1, num_updates=32200, lr=0.000352454, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27625 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 260 / 1689 loss=3.557, nll_loss=2.006, ppl=4.02, wps=588632, ups=1.19, wpb=495114, bsz=16481.1, num_updates=32300, lr=0.000351908, gnorm=0.147, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=27709 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 360 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=587152, ups=1.19, wpb=495074, bsz=16647.4, num_updates=32400, lr=0.000351364, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.1, wall=27793 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 460 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=590154, ups=1.19, wpb=497326, bsz=16735.7, num_updates=32500, lr=0.000350823, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=27877 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 560 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=584075, ups=1.18, wpb=494715, bsz=16742.3, num_updates=32600, lr=0.000350285, gnorm=0.136, clip=0, loss_scale=8, train_wall=84, gb_free=62.2, wall=27962 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 661 / 1689 loss=3.571, nll_loss=2.023, ppl=4.06, wps=579185, ups=1.17, wpb=495271, bsz=16359.1, num_updates=32700, lr=0.000349749, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=28047 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 761 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=587150, ups=1.18, wpb=496759, bsz=17022, num_updates=32800, lr=0.000349215, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=28132 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 861 / 1689 loss=3.569, nll_loss=2.021, ppl=4.06, wps=584783, ups=1.18, wpb=493975, bsz=16377.9, num_updates=32900, lr=0.000348684, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28217 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 epoch 020: 961 / 1689 loss=3.575, nll_loss=2.028, ppl=4.08, wps=584224, ups=1.18, wpb=494560, bsz=16449.8, num_updates=33000, lr=0.000348155, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.8, wall=28301 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020 | valid on 'valid' subset | loss 3.71 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.701 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1061 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=537965, ups=1.09, wpb=494604, bsz=16277.8, num_updates=33100, lr=0.000347629, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=28393 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1161 / 1689 loss=3.565, nll_loss=2.017, ppl=4.05, wps=586640, ups=1.18, wpb=495112, bsz=16679.4, num_updates=33200, lr=0.000347105, gnorm=0.136, clip=0, loss_scale=8, train_wall=83, gb_free=61.3, wall=28478 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1261 / 1689 loss=3.572, nll_loss=2.024, ppl=4.07, wps=584162, ups=1.18, wpb=495242, bsz=16632.6, num_updates=33300, lr=0.000346583, gnorm=0.133, clip=0, loss_scale=8, train_wall=84, gb_free=59.5, wall=28562 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1362 / 1689 loss=3.576, nll_loss=2.03, ppl=4.08, wps=579925, ups=1.17, wpb=494762, bsz=16369.9, num_updates=33400, lr=0.000346064, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=28648 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1462 / 1689 loss=3.576, nll_loss=2.029, ppl=4.08, wps=584149, ups=1.18, wpb=494978, bsz=16563, num_updates=33500, lr=0.000345547, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=28732 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1562 / 1689 loss=3.574, nll_loss=2.027, ppl=4.08, wps=586311, ups=1.18, wpb=494838, bsz=16159.3, num_updates=33600, lr=0.000345033, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=28817 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 epoch 020: 1662 / 1689 loss=3.573, nll_loss=2.025, ppl=4.07, wps=585454, ups=1.18, wpb=496598, bsz=16540.1, num_updates=33700, lr=0.00034452, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=60.5, wall=28902 end of epoch 20 (average epoch stats below) epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 epoch 020 | loss 3.566 | nll_loss 2.018 | ppl 4.05 | wps 581814 | ups 1.18 | wpb 495120 | bsz 16502.5 | num_updates 33727 | lr 0.000344383 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 63.2 | wall 28924 Start iterating over samples epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 73 / 1689 loss=3.55, nll_loss=1.999, ppl=4, wps=584623, ups=1.19, wpb=492146, bsz=16309.8, num_updates=33800, lr=0.00034401, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=28986 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 174 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=578315, ups=1.17, wpb=494835, bsz=16358.9, num_updates=33900, lr=0.000343503, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=61.8, wall=29071 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 epoch 021: 275 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=577968, ups=1.17, wpb=495662, bsz=16537.8, num_updates=34000, lr=0.000342997, gnorm=0.138, clip=0, loss_scale=2, train_wall=85, gb_free=61.6, wall=29157 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.712 | nll_loss 2.158 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.701 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 375 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=514533, ups=1.04, wpb=494253, bsz=16811.8, num_updates=34100, lr=0.000342494, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=29253 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 475 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=584693, ups=1.18, wpb=494515, bsz=16686.1, num_updates=34200, lr=0.000341993, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=29338 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 575 / 1689 loss=3.557, nll_loss=2.008, ppl=4.02, wps=587986, ups=1.19, wpb=494716, bsz=16644.7, num_updates=34300, lr=0.000341494, gnorm=0.13, clip=0, loss_scale=2, train_wall=83, gb_free=60.8, wall=29422 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 675 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=585717, ups=1.18, wpb=495314, bsz=16547.3, num_updates=34400, lr=0.000340997, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=29506 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 775 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=586790, ups=1.19, wpb=494768, bsz=16611.7, num_updates=34500, lr=0.000340503, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=29591 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 875 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=586986, ups=1.18, wpb=495566, bsz=16255.1, num_updates=34600, lr=0.00034001, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=29675 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 975 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586836, ups=1.18, wpb=496648, bsz=16297.4, num_updates=34700, lr=0.00033952, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=29760 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1076 / 1689 loss=3.561, nll_loss=2.012, ppl=4.03, wps=580449, ups=1.17, wpb=494861, bsz=16422.5, num_updates=34800, lr=0.000339032, gnorm=0.143, clip=0, loss_scale=2, train_wall=84, gb_free=62.3, wall=29845 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1176 / 1689 loss=3.564, nll_loss=2.016, ppl=4.05, wps=586282, ups=1.18, wpb=495389, bsz=16767.1, num_updates=34900, lr=0.000338546, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=60.7, wall=29930 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 epoch 021: 1277 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=580371, ups=1.17, wpb=494752, bsz=16676.8, num_updates=35000, lr=0.000338062, gnorm=0.135, clip=0, loss_scale=1, train_wall=84, gb_free=61.8, wall=30015 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021 | valid on 'valid' subset | loss 3.707 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.701 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1377 / 1689 loss=3.567, nll_loss=2.019, ppl=4.05, wps=510442, ups=1.03, wpb=495253, bsz=16146.5, num_updates=35100, lr=0.00033758, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=30112 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1477 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=591911, ups=1.19, wpb=496903, bsz=16361.3, num_updates=35200, lr=0.0003371, gnorm=0.138, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30196 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1577 / 1689 loss=3.57, nll_loss=2.023, ppl=4.06, wps=591659, ups=1.19, wpb=496768, bsz=16438.5, num_updates=35300, lr=0.000336622, gnorm=0.143, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=30280 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 epoch 021: 1677 / 1689 loss=3.566, nll_loss=2.019, ppl=4.05, wps=586308, ups=1.19, wpb=494461, bsz=16535, num_updates=35400, lr=0.000336146, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=57.8, wall=30364 end of epoch 21 (average epoch stats below) epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 epoch 021 | loss 3.56 | nll_loss 2.011 | ppl 4.03 | wps 575428 | ups 1.16 | wpb 495117 | bsz 16503.9 | num_updates 35412 | lr 0.000336089 | gnorm 0.138 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 63.5 | wall 30374 Start iterating over samples epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 88 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=579391, ups=1.18, wpb=491209, bsz=16542.8, num_updates=35500, lr=0.000335673, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=30449 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 188 / 1689 loss=3.536, nll_loss=1.983, ppl=3.95, wps=584857, ups=1.18, wpb=493635, bsz=16042.6, num_updates=35600, lr=0.000335201, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30533 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 288 / 1689 loss=3.544, nll_loss=1.992, ppl=3.98, wps=581034, ups=1.18, wpb=494124, bsz=16657.3, num_updates=35700, lr=0.000334731, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=30618 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 388 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=587278, ups=1.18, wpb=495953, bsz=16806.8, num_updates=35800, lr=0.000334263, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=30703 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 488 / 1689 loss=3.546, nll_loss=1.995, ppl=3.99, wps=584490, ups=1.18, wpb=495474, bsz=16569.6, num_updates=35900, lr=0.000333797, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=30788 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 epoch 022: 588 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=587051, ups=1.18, wpb=495746, bsz=16622, num_updates=36000, lr=0.000333333, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=30872 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.699 | nll_loss 2.15 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.699 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 688 / 1689 loss=3.553, nll_loss=2.003, ppl=4.01, wps=508743, ups=1.03, wpb=495838, bsz=16449.5, num_updates=36100, lr=0.000332871, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=30969 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 788 / 1689 loss=3.555, nll_loss=2.005, ppl=4.01, wps=588733, ups=1.19, wpb=494453, bsz=16301.8, num_updates=36200, lr=0.000332411, gnorm=0.13, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=31053 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 888 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=582196, ups=1.18, wpb=494396, bsz=16708.3, num_updates=36300, lr=0.000331953, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=60.1, wall=31138 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 989 / 1689 loss=3.56, nll_loss=2.011, ppl=4.03, wps=583840, ups=1.18, wpb=496619, bsz=16533.4, num_updates=36400, lr=0.000331497, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=58.9, wall=31223 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1089 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=586008, ups=1.18, wpb=494952, bsz=16559, num_updates=36500, lr=0.000331042, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=31308 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1189 / 1689 loss=3.563, nll_loss=2.015, ppl=4.04, wps=586410, ups=1.18, wpb=495991, bsz=16386.8, num_updates=36600, lr=0.00033059, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=62, wall=31393 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1289 / 1689 loss=3.558, nll_loss=2.009, ppl=4.03, wps=587608, ups=1.18, wpb=496937, bsz=16475.1, num_updates=36700, lr=0.000330139, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=31477 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1389 / 1689 loss=3.566, nll_loss=2.018, ppl=4.05, wps=589391, ups=1.19, wpb=495977, bsz=16665.8, num_updates=36800, lr=0.00032969, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=31561 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1489 / 1689 loss=3.563, nll_loss=2.014, ppl=4.04, wps=584697, ups=1.18, wpb=495146, bsz=16278.4, num_updates=36900, lr=0.000329243, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=31646 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 epoch 022: 1590 / 1689 loss=3.562, nll_loss=2.014, ppl=4.04, wps=581690, ups=1.17, wpb=495708, bsz=16676.9, num_updates=37000, lr=0.000328798, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31731 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 epoch 022 | valid on 'valid' subset | loss 3.704 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.699 end of epoch 22 (average epoch stats below) epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 epoch 022 | loss 3.554 | nll_loss 2.004 | ppl 4.01 | wps 576937 | ups 1.17 | wpb 495129 | bsz 16507.6 | num_updates 37099 | lr 0.000328359 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 61.7 | wall 31821 Start iterating over samples epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 1 / 1689 loss=3.561, nll_loss=2.013, ppl=4.03, wps=537038, ups=1.09, wpb=491514, bsz=16394.1, num_updates=37100, lr=0.000328355, gnorm=0.141, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=31823 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 101 / 1689 loss=3.528, nll_loss=1.974, ppl=3.93, wps=584671, ups=1.18, wpb=495459, bsz=16571.8, num_updates=37200, lr=0.000327913, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=60, wall=31907 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 201 / 1689 loss=3.535, nll_loss=1.982, ppl=3.95, wps=585983, ups=1.18, wpb=495029, bsz=17094.4, num_updates=37300, lr=0.000327473, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=31992 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 301 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=585399, ups=1.18, wpb=496862, bsz=16738.6, num_updates=37400, lr=0.000327035, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=62.1, wall=32077 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 401 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=585252, ups=1.18, wpb=495750, bsz=16535.2, num_updates=37500, lr=0.000326599, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=32161 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 502 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=579630, ups=1.17, wpb=495917, bsz=16831.2, num_updates=37600, lr=0.000326164, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=32247 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 602 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=583605, ups=1.18, wpb=494166, bsz=16175.6, num_updates=37700, lr=0.000325731, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=61.5, wall=32332 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 702 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=584612, ups=1.18, wpb=494736, bsz=16525.6, num_updates=37800, lr=0.0003253, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=32416 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 802 / 1689 loss=3.551, nll_loss=2, ppl=4, wps=586610, ups=1.18, wpb=495910, bsz=16517.4, num_updates=37900, lr=0.000324871, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=32501 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 epoch 023: 902 / 1689 loss=3.557, nll_loss=2.007, ppl=4.02, wps=584493, ups=1.18, wpb=494991, bsz=16879.3, num_updates=38000, lr=0.000324443, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=32586 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023 | valid on 'valid' subset | loss 3.709 | nll_loss 2.161 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.699 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1002 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=530501, ups=1.07, wpb=495307, bsz=16198.2, num_updates=38100, lr=0.000324017, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=32679 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1103 / 1689 loss=3.559, nll_loss=2.01, ppl=4.03, wps=582848, ups=1.18, wpb=494513, bsz=16284.2, num_updates=38200, lr=0.000323592, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=32764 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1203 / 1689 loss=3.551, nll_loss=2.002, ppl=4, wps=586947, ups=1.18, wpb=495650, bsz=16210.1, num_updates=38300, lr=0.00032317, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=32848 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1303 / 1689 loss=3.558, nll_loss=2.009, ppl=4.02, wps=589673, ups=1.19, wpb=495189, bsz=16523.8, num_updates=38400, lr=0.000322749, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=32932 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1403 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=586536, ups=1.19, wpb=494507, bsz=16473.4, num_updates=38500, lr=0.000322329, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=33017 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1503 / 1689 loss=3.548, nll_loss=1.998, ppl=4, wps=581760, ups=1.17, wpb=495153, bsz=16529.1, num_updates=38600, lr=0.000321911, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=33102 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 epoch 023: 1603 / 1689 loss=3.555, nll_loss=2.006, ppl=4.02, wps=584031, ups=1.18, wpb=496398, bsz=16338.2, num_updates=38700, lr=0.000321495, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=33187 end of epoch 23 (average epoch stats below) epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 epoch 023 | loss 3.548 | nll_loss 1.998 | ppl 3.99 | wps 580945 | ups 1.17 | wpb 495129 | bsz 16506.7 | num_updates 38785 | lr 0.000321143 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 63.2 | wall 33258 Start iterating over samples epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 15 / 1689 loss=3.552, nll_loss=2.002, ppl=4.01, wps=578814, ups=1.18, wpb=491597, bsz=16208.7, num_updates=38800, lr=0.000321081, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=33272 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 115 / 1689 loss=3.533, nll_loss=1.98, ppl=3.94, wps=587134, ups=1.18, wpb=496361, bsz=16702.2, num_updates=38900, lr=0.000320668, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33356 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 epoch 024: 215 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=581683, ups=1.17, wpb=495911, bsz=16788.3, num_updates=39000, lr=0.000320256, gnorm=0.147, clip=0, loss_scale=2, train_wall=84, gb_free=59.8, wall=33441 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024 | valid on 'valid' subset | loss 3.697 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.697 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 315 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=517300, ups=1.04, wpb=495711, bsz=16254.4, num_updates=39100, lr=0.000319847, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=33537 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 415 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=584501, ups=1.18, wpb=494583, bsz=16410.7, num_updates=39200, lr=0.000319438, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=33622 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 515 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=587004, ups=1.18, wpb=496894, bsz=16625.7, num_updates=39300, lr=0.000319032, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=33706 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 615 / 1689 loss=3.542, nll_loss=1.99, ppl=3.97, wps=584639, ups=1.18, wpb=494814, bsz=16333, num_updates=39400, lr=0.000318626, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33791 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 715 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=583297, ups=1.18, wpb=494258, bsz=16370.5, num_updates=39500, lr=0.000318223, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=33876 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 816 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=580679, ups=1.17, wpb=496755, bsz=16342.8, num_updates=39600, lr=0.000317821, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=33961 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 916 / 1689 loss=3.542, nll_loss=1.991, ppl=3.98, wps=584096, ups=1.18, wpb=495675, bsz=16588.1, num_updates=39700, lr=0.00031742, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=62.4, wall=34046 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1016 / 1689 loss=3.547, nll_loss=1.997, ppl=3.99, wps=587119, ups=1.19, wpb=495363, bsz=16592, num_updates=39800, lr=0.000317021, gnorm=0.127, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=34131 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1116 / 1689 loss=3.547, nll_loss=1.996, ppl=3.99, wps=579577, ups=1.17, wpb=496128, bsz=16890.2, num_updates=39900, lr=0.000316624, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=34216 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 epoch 024: 1216 / 1689 loss=3.545, nll_loss=1.994, ppl=3.98, wps=583601, ups=1.18, wpb=494196, bsz=16706.6, num_updates=40000, lr=0.000316228, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.2, wall=34301 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024 | valid on 'valid' subset | loss 3.696 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.696 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1316 / 1689 loss=3.545, nll_loss=1.995, ppl=3.98, wps=514296, ups=1.04, wpb=494949, bsz=16521.4, num_updates=40100, lr=0.000315833, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=34397 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1416 / 1689 loss=3.549, nll_loss=1.999, ppl=4, wps=583551, ups=1.18, wpb=493810, bsz=16201.5, num_updates=40200, lr=0.00031544, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=60.8, wall=34482 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1516 / 1689 loss=3.556, nll_loss=2.007, ppl=4.02, wps=585871, ups=1.18, wpb=495332, bsz=16471, num_updates=40300, lr=0.000315049, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=34566 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 epoch 024: 1616 / 1689 loss=3.554, nll_loss=2.005, ppl=4.01, wps=588293, ups=1.19, wpb=496054, bsz=16560.1, num_updates=40400, lr=0.000314658, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=34651 end of epoch 24 (average epoch stats below) epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 epoch 024 | loss 3.543 | nll_loss 1.992 | ppl 3.98 | wps 575088 | ups 1.16 | wpb 495118 | bsz 16508.5 | num_updates 40473 | lr 0.000314374 | gnorm 0.137 | clip 0 | loss_scale 4 | train_wall 1410 | gb_free 62.7 | wall 34712 Start iterating over samples epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 27 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=583368, ups=1.19, wpb=490836, bsz=16153.3, num_updates=40500, lr=0.00031427, gnorm=0.143, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=34735 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 127 / 1689 loss=3.525, nll_loss=1.971, ppl=3.92, wps=585711, ups=1.18, wpb=494949, bsz=16338, num_updates=40600, lr=0.000313882, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.1, wall=34819 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 228 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581243, ups=1.17, wpb=496628, bsz=16460.6, num_updates=40700, lr=0.000313497, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=60.3, wall=34905 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 328 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=584553, ups=1.18, wpb=496477, bsz=16775.5, num_updates=40800, lr=0.000313112, gnorm=0.139, clip=0, loss_scale=2, train_wall=84, gb_free=60.8, wall=34990 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 428 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=585785, ups=1.18, wpb=495480, bsz=16492.5, num_updates=40900, lr=0.000312729, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=35074 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 epoch 025: 528 / 1689 loss=3.538, nll_loss=1.986, ppl=3.96, wps=588846, ups=1.19, wpb=494892, bsz=16639.6, num_updates=41000, lr=0.000312348, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=35158 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025 | valid on 'valid' subset | loss 3.709 | nll_loss 2.162 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.696 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 628 / 1689 loss=3.539, nll_loss=1.987, ppl=3.97, wps=532122, ups=1.08, wpb=494916, bsz=16365.5, num_updates=41100, lr=0.000311967, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=35251 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 728 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=588225, ups=1.19, wpb=495682, bsz=16773.6, num_updates=41200, lr=0.000311588, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=35336 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 829 / 1689 loss=3.543, nll_loss=1.992, ppl=3.98, wps=582118, ups=1.18, wpb=495350, bsz=16331.4, num_updates=41300, lr=0.000311211, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=60.5, wall=35421 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 929 / 1689 loss=3.548, nll_loss=1.998, ppl=3.99, wps=586608, ups=1.19, wpb=494608, bsz=16877.3, num_updates=41400, lr=0.000310835, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=35505 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1029 / 1689 loss=3.54, nll_loss=1.989, ppl=3.97, wps=582975, ups=1.18, wpb=494252, bsz=16490.2, num_updates=41500, lr=0.00031046, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=35590 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1129 / 1689 loss=3.536, nll_loss=1.985, ppl=3.96, wps=590199, ups=1.19, wpb=494910, bsz=16147.8, num_updates=41600, lr=0.000310087, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=59.4, wall=35674 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1229 / 1689 loss=3.544, nll_loss=1.993, ppl=3.98, wps=586503, ups=1.19, wpb=494629, bsz=16383.6, num_updates=41700, lr=0.000309715, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=35758 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1329 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=590786, ups=1.19, wpb=497588, bsz=16657, num_updates=41800, lr=0.000309344, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=35842 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1429 / 1689 loss=3.541, nll_loss=1.99, ppl=3.97, wps=581996, ups=1.18, wpb=494888, bsz=16390, num_updates=41900, lr=0.000308975, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=35927 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 epoch 025: 1529 / 1689 loss=3.545, nll_loss=1.995, ppl=3.99, wps=583258, ups=1.18, wpb=494226, bsz=16804.2, num_updates=42000, lr=0.000308607, gnorm=0.133, clip=0, loss_scale=4, train_wall=84, gb_free=61.4, wall=36012 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025 | valid on 'valid' subset | loss 3.695 | nll_loss 2.146 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.695 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 epoch 025: 1629 / 1689 loss=3.543, nll_loss=1.993, ppl=3.98, wps=515008, ups=1.04, wpb=495420, bsz=16322.4, num_updates=42100, lr=0.00030824, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=36108 end of epoch 25 (average epoch stats below) epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 epoch 025 | loss 3.538 | nll_loss 1.986 | ppl 3.96 | wps 577307 | ups 1.17 | wpb 495114 | bsz 16504.9 | num_updates 42160 | lr 0.000308021 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.3 | wall 36158 Start iterating over samples epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 40 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=584480, ups=1.19, wpb=492013, bsz=16520.6, num_updates=42200, lr=0.000307875, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=36192 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 140 / 1689 loss=3.518, nll_loss=1.963, ppl=3.9, wps=585119, ups=1.18, wpb=495073, bsz=16471.7, num_updates=42300, lr=0.00030751, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=36277 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 241 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=580845, ups=1.17, wpb=496057, bsz=16802.3, num_updates=42400, lr=0.000307148, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36362 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 341 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584988, ups=1.18, wpb=494455, bsz=16730.5, num_updates=42500, lr=0.000306786, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36447 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 441 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584882, ups=1.18, wpb=494590, bsz=16201.1, num_updates=42600, lr=0.000306426, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=36531 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 541 / 1689 loss=3.526, nll_loss=1.972, ppl=3.92, wps=583522, ups=1.18, wpb=495630, bsz=16604.6, num_updates=42700, lr=0.000306067, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=62.2, wall=36616 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 641 / 1689 loss=3.531, nll_loss=1.978, ppl=3.94, wps=590772, ups=1.19, wpb=495312, bsz=16084.3, num_updates=42800, lr=0.000305709, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.2, wall=36700 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 742 / 1689 loss=3.532, nll_loss=1.98, ppl=3.94, wps=579018, ups=1.17, wpb=496257, bsz=16732, num_updates=42900, lr=0.000305352, gnorm=0.14, clip=0, loss_scale=4, train_wall=85, gb_free=62.1, wall=36786 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 epoch 026: 842 / 1689 loss=3.537, nll_loss=1.985, ppl=3.96, wps=582762, ups=1.18, wpb=495045, bsz=16528.5, num_updates=43000, lr=0.000304997, gnorm=0.141, clip=0, loss_scale=4, train_wall=84, gb_free=61, wall=36871 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026 | valid on 'valid' subset | loss 3.697 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.695 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 942 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=536044, ups=1.08, wpb=495800, bsz=16491.7, num_updates=43100, lr=0.000304643, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=36963 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1043 / 1689 loss=3.537, nll_loss=1.986, ppl=3.96, wps=579007, ups=1.17, wpb=495385, bsz=16280.1, num_updates=43200, lr=0.00030429, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=37049 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1143 / 1689 loss=3.538, nll_loss=1.987, ppl=3.97, wps=589419, ups=1.19, wpb=495308, bsz=16975.4, num_updates=43300, lr=0.000303939, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=37133 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1243 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=585514, ups=1.18, wpb=495626, bsz=16258.6, num_updates=43400, lr=0.000303588, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=37218 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1343 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=588381, ups=1.19, wpb=495569, bsz=16432.5, num_updates=43500, lr=0.000303239, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=37302 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1443 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=585373, ups=1.18, wpb=495737, bsz=16639.4, num_updates=43600, lr=0.000302891, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=37387 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1544 / 1689 loss=3.544, nll_loss=1.994, ppl=3.98, wps=582033, ups=1.18, wpb=494730, bsz=16415.4, num_updates=43700, lr=0.000302545, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.1, wall=37472 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 epoch 026: 1644 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=583821, ups=1.18, wpb=495048, bsz=16410.8, num_updates=43800, lr=0.000302199, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=60.8, wall=37556 end of epoch 26 (average epoch stats below) epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 epoch 026 | loss 3.533 | nll_loss 1.981 | ppl 3.95 | wps 581146 | ups 1.17 | wpb 495097 | bsz 16501.8 | num_updates 43845 | lr 0.000302044 | gnorm 0.136 | clip 0 | loss_scale 1 | train_wall 1408 | gb_free 62.5 | wall 37594 Start iterating over samples epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 55 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=579568, ups=1.18, wpb=490299, bsz=16026, num_updates=43900, lr=0.000301855, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37641 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 epoch 027: 155 / 1689 loss=3.51, nll_loss=1.954, ppl=3.88, wps=586800, ups=1.18, wpb=495930, bsz=16524.4, num_updates=44000, lr=0.000301511, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=37725 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027 | valid on 'valid' subset | loss 3.7 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.695 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 255 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=539574, ups=1.09, wpb=495645, bsz=15993.9, num_updates=44100, lr=0.000301169, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=37817 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 355 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587604, ups=1.19, wpb=495861, bsz=16575.3, num_updates=44200, lr=0.000300828, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37902 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 455 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=587764, ups=1.18, wpb=496120, bsz=16376.6, num_updates=44300, lr=0.000300489, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=37986 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 555 / 1689 loss=3.524, nll_loss=1.97, ppl=3.92, wps=590560, ups=1.19, wpb=496671, bsz=16293.1, num_updates=44400, lr=0.00030015, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38070 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 655 / 1689 loss=3.525, nll_loss=1.972, ppl=3.92, wps=584754, ups=1.18, wpb=493584, bsz=16711.3, num_updates=44500, lr=0.000299813, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=38155 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 755 / 1689 loss=3.529, nll_loss=1.976, ppl=3.94, wps=582741, ups=1.18, wpb=494026, bsz=16714.8, num_updates=44600, lr=0.000299476, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=38239 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 855 / 1689 loss=3.531, nll_loss=1.979, ppl=3.94, wps=587899, ups=1.19, wpb=494885, bsz=16747, num_updates=44700, lr=0.000299141, gnorm=0.139, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=38324 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 955 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=583043, ups=1.18, wpb=495188, bsz=16222.5, num_updates=44800, lr=0.000298807, gnorm=0.134, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=38409 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1055 / 1689 loss=3.534, nll_loss=1.982, ppl=3.95, wps=579570, ups=1.17, wpb=494220, bsz=16861.4, num_updates=44900, lr=0.000298474, gnorm=0.132, clip=0, loss_scale=4, train_wall=84, gb_free=61.2, wall=38494 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 epoch 027: 1155 / 1689 loss=3.533, nll_loss=1.981, ppl=3.95, wps=585401, ups=1.18, wpb=494956, bsz=17000.2, num_updates=45000, lr=0.000298142, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=38578 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027 | valid on 'valid' subset | loss 3.687 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.687 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1255 / 1689 loss=3.538, nll_loss=1.987, ppl=3.96, wps=513238, ups=1.04, wpb=495820, bsz=16303.6, num_updates=45100, lr=0.000297812, gnorm=0.138, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=38675 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1357 / 1689 loss=3.535, nll_loss=1.983, ppl=3.95, wps=577334, ups=1.16, wpb=495666, bsz=16384.6, num_updates=45200, lr=0.000297482, gnorm=0.135, clip=0, loss_scale=2, train_wall=85, gb_free=61.3, wall=38761 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1457 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=585476, ups=1.18, wpb=496101, bsz=16435.5, num_updates=45300, lr=0.000297154, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=60.1, wall=38846 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1557 / 1689 loss=3.539, nll_loss=1.988, ppl=3.97, wps=586878, ups=1.19, wpb=495131, bsz=16396.5, num_updates=45400, lr=0.000296826, gnorm=0.129, clip=0, loss_scale=2, train_wall=83, gb_free=61.1, wall=38930 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 epoch 027: 1657 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=586270, ups=1.18, wpb=495582, bsz=16837.7, num_updates=45500, lr=0.0002965, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=39014 end of epoch 27 (average epoch stats below) epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 epoch 027 | loss 3.528 | nll_loss 1.976 | ppl 3.93 | wps 577285 | ups 1.17 | wpb 495108 | bsz 16502.9 | num_updates 45532 | lr 0.000296396 | gnorm 0.134 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.9 | wall 39041 Start iterating over samples epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 68 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=584988, ups=1.19, wpb=490784, bsz=16614.6, num_updates=45600, lr=0.000296174, gnorm=0.141, clip=0, loss_scale=2, train_wall=82, gb_free=61.8, wall=39098 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 168 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=585919, ups=1.18, wpb=497599, bsz=16268, num_updates=45700, lr=0.00029585, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=60.9, wall=39183 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 268 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=584975, ups=1.18, wpb=494796, bsz=16484.5, num_updates=45800, lr=0.000295527, gnorm=0.138, clip=0, loss_scale=4, train_wall=84, gb_free=61.6, wall=39268 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 368 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587481, ups=1.19, wpb=494874, bsz=16615.5, num_updates=45900, lr=0.000295205, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=39352 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 epoch 028: 469 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=579587, ups=1.17, wpb=496347, bsz=16684.1, num_updates=46000, lr=0.000294884, gnorm=0.134, clip=0, loss_scale=2, train_wall=85, gb_free=62.2, wall=39438 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028 | valid on 'valid' subset | loss 3.699 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.687 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 569 / 1689 loss=3.521, nll_loss=1.968, ppl=3.91, wps=531448, ups=1.08, wpb=494199, bsz=17136.1, num_updates=46100, lr=0.000294564, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=39531 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 669 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=591300, ups=1.19, wpb=495856, bsz=16509.1, num_updates=46200, lr=0.000294245, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=39615 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 769 / 1689 loss=3.518, nll_loss=1.964, ppl=3.9, wps=587373, ups=1.18, wpb=496533, bsz=16446.9, num_updates=46300, lr=0.000293927, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=39699 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 869 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=584697, ups=1.18, wpb=496238, bsz=16445.7, num_updates=46400, lr=0.00029361, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=39784 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 969 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=588666, ups=1.19, wpb=495308, bsz=16403, num_updates=46500, lr=0.000293294, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.1, wall=39868 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1069 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=588174, ups=1.19, wpb=495759, bsz=16210.4, num_updates=46600, lr=0.000292979, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=39952 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1169 / 1689 loss=3.526, nll_loss=1.973, ppl=3.93, wps=588814, ups=1.18, wpb=497012, bsz=16246.5, num_updates=46700, lr=0.000292666, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=40037 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1269 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=585306, ups=1.18, wpb=494638, bsz=16362.2, num_updates=46800, lr=0.000292353, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=40121 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1369 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=587285, ups=1.18, wpb=496765, bsz=16636.2, num_updates=46900, lr=0.000292041, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.7, wall=40206 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 epoch 028: 1470 / 1689 loss=3.532, nll_loss=1.981, ppl=3.95, wps=580118, ups=1.17, wpb=494626, bsz=16507, num_updates=47000, lr=0.00029173, gnorm=0.135, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=40291 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028 | valid on 'valid' subset | loss 3.685 | nll_loss 2.136 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.685 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1570 / 1689 loss=3.535, nll_loss=1.984, ppl=3.96, wps=515770, ups=1.04, wpb=494378, bsz=16422.1, num_updates=47100, lr=0.00029142, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=62.6, wall=40387 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 epoch 028: 1670 / 1689 loss=3.538, nll_loss=1.988, ppl=3.97, wps=586410, ups=1.19, wpb=492580, bsz=16725.3, num_updates=47200, lr=0.000291111, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=40471 end of epoch 28 (average epoch stats below) epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 epoch 028 | loss 3.524 | nll_loss 1.971 | ppl 3.92 | wps 577757 | ups 1.17 | wpb 495122 | bsz 16503.8 | num_updates 47219 | lr 0.000291053 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1407 | gb_free 62.2 | wall 40486 Start iterating over samples epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 81 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=580329, ups=1.18, wpb=490928, bsz=16436.1, num_updates=47300, lr=0.000290803, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=40556 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 181 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=588625, ups=1.19, wpb=495351, bsz=16975.9, num_updates=47400, lr=0.000290496, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=40640 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 282 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=580180, ups=1.17, wpb=494945, bsz=16228.6, num_updates=47500, lr=0.000290191, gnorm=0.133, clip=0, loss_scale=2, train_wall=84, gb_free=61.1, wall=40725 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 382 / 1689 loss=3.515, nll_loss=1.96, ppl=3.89, wps=588685, ups=1.19, wpb=494542, bsz=16025, num_updates=47600, lr=0.000289886, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=40809 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 482 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586993, ups=1.18, wpb=495452, bsz=16718.6, num_updates=47700, lr=0.000289581, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.2, wall=40894 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 582 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=583298, ups=1.18, wpb=494910, bsz=16880, num_updates=47800, lr=0.000289278, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.8, wall=40978 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 682 / 1689 loss=3.519, nll_loss=1.965, ppl=3.9, wps=589556, ups=1.19, wpb=496790, bsz=16554.6, num_updates=47900, lr=0.000288976, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=41063 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 epoch 029: 783 / 1689 loss=3.522, nll_loss=1.969, ppl=3.91, wps=580348, ups=1.17, wpb=496660, bsz=16364, num_updates=48000, lr=0.000288675, gnorm=0.135, clip=0, loss_scale=1, train_wall=85, gb_free=62.1, wall=41148 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029 | valid on 'valid' subset | loss 3.689 | nll_loss 2.136 | ppl 4.39 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.685 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 883 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=529613, ups=1.07, wpb=492714, bsz=16678.2, num_updates=48100, lr=0.000288375, gnorm=0.141, clip=0, loss_scale=1, train_wall=84, gb_free=62.2, wall=41241 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 983 / 1689 loss=3.525, nll_loss=1.973, ppl=3.92, wps=587974, ups=1.19, wpb=493075, bsz=16542.2, num_updates=48200, lr=0.000288076, gnorm=0.141, clip=0, loss_scale=1, train_wall=83, gb_free=61.6, wall=41325 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1083 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=586411, ups=1.18, wpb=496414, bsz=16338.1, num_updates=48300, lr=0.000287777, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.6, wall=41410 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1183 / 1689 loss=3.523, nll_loss=1.97, ppl=3.92, wps=588998, ups=1.19, wpb=496307, bsz=16758.4, num_updates=48400, lr=0.00028748, gnorm=0.13, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=41494 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1283 / 1689 loss=3.528, nll_loss=1.976, ppl=3.93, wps=588282, ups=1.19, wpb=496337, bsz=16520.8, num_updates=48500, lr=0.000287183, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=41578 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1383 / 1689 loss=3.534, nll_loss=1.983, ppl=3.95, wps=587767, ups=1.18, wpb=496561, bsz=16520.6, num_updates=48600, lr=0.000286888, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=41663 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1483 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583416, ups=1.18, wpb=494670, bsz=16444.9, num_updates=48700, lr=0.000286593, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=41748 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1584 / 1689 loss=3.529, nll_loss=1.977, ppl=3.94, wps=578858, ups=1.17, wpb=493836, bsz=16285.4, num_updates=48800, lr=0.000286299, gnorm=0.131, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=41833 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 epoch 029: 1684 / 1689 loss=3.53, nll_loss=1.978, ppl=3.94, wps=587910, ups=1.18, wpb=496805, bsz=16270.8, num_updates=48900, lr=0.000286006, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=41918 end of epoch 29 (average epoch stats below) epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 epoch 029 | loss 3.52 | nll_loss 1.967 | ppl 3.91 | wps 581868 | ups 1.18 | wpb 495109 | bsz 16504.1 | num_updates 48905 | lr 0.000285992 | gnorm 0.135 | clip 0 | loss_scale 1 | train_wall 1407 | gb_free 65.5 | wall 41921 Start iterating over samples epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 epoch 030: 95 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=582684, ups=1.19, wpb=491104, bsz=16510.3, num_updates=49000, lr=0.000285714, gnorm=0.14, clip=0, loss_scale=1, train_wall=83, gb_free=60.5, wall=42002 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.695 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.685 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 195 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=536213, ups=1.08, wpb=495182, bsz=16201.9, num_updates=49100, lr=0.000285423, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42094 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 295 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=585062, ups=1.18, wpb=496020, bsz=16304.6, num_updates=49200, lr=0.000285133, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61.9, wall=42179 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 395 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=584688, ups=1.18, wpb=494914, bsz=16353.8, num_updates=49300, lr=0.000284844, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=42264 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 495 / 1689 loss=3.514, nll_loss=1.959, ppl=3.89, wps=587660, ups=1.18, wpb=496281, bsz=16852.6, num_updates=49400, lr=0.000284555, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=42348 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 595 / 1689 loss=3.513, nll_loss=1.958, ppl=3.89, wps=589773, ups=1.19, wpb=495997, bsz=16496.6, num_updates=49500, lr=0.000284268, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=42432 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 696 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=583588, ups=1.18, wpb=495551, bsz=16456.2, num_updates=49600, lr=0.000283981, gnorm=0.15, clip=0, loss_scale=1, train_wall=84, gb_free=62, wall=42517 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 796 / 1689 loss=3.52, nll_loss=1.966, ppl=3.91, wps=589344, ups=1.19, wpb=496369, bsz=16303.8, num_updates=49700, lr=0.000283695, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=42601 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 896 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=583819, ups=1.18, wpb=494834, bsz=16456.1, num_updates=49800, lr=0.00028341, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=60.4, wall=42686 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 996 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=584121, ups=1.18, wpb=494695, bsz=16616.5, num_updates=49900, lr=0.000283126, gnorm=0.133, clip=0, loss_scale=1, train_wall=84, gb_free=61.7, wall=42771 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 epoch 030: 1096 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=585546, ups=1.18, wpb=494970, bsz=16527.8, num_updates=50000, lr=0.000282843, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=60.9, wall=42855 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030 | valid on 'valid' subset | loss 3.702 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.685 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1196 / 1689 loss=3.528, nll_loss=1.975, ppl=3.93, wps=532211, ups=1.08, wpb=494193, bsz=16915, num_updates=50100, lr=0.00028256, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=42948 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1296 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584355, ups=1.18, wpb=494198, bsz=16396.9, num_updates=50200, lr=0.000282279, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=43033 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1396 / 1689 loss=3.524, nll_loss=1.971, ppl=3.92, wps=583225, ups=1.18, wpb=495318, bsz=16544.9, num_updates=50300, lr=0.000281998, gnorm=0.144, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43118 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1496 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=589306, ups=1.19, wpb=495905, bsz=16741.5, num_updates=50400, lr=0.000281718, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.9, wall=43202 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 epoch 030: 1596 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=587949, ups=1.18, wpb=496777, bsz=16528.5, num_updates=50500, lr=0.000281439, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=62.3, wall=43286 end of epoch 30 (average epoch stats below) epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 epoch 030 | loss 3.516 | nll_loss 1.962 | ppl 3.9 | wps 579098 | ups 1.17 | wpb 495120 | bsz 16507 | num_updates 50593 | lr 0.00028118 | gnorm 0.138 | clip 0 | loss_scale 4 | train_wall 1407 | gb_free 63.4 | wall 43364 Start iterating over samples epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 7 / 1689 loss=3.527, nll_loss=1.974, ppl=3.93, wps=581465, ups=1.19, wpb=490607, bsz=16419.6, num_updates=50600, lr=0.000281161, gnorm=0.146, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=43371 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 108 / 1689 loss=3.501, nll_loss=1.944, ppl=3.85, wps=582456, ups=1.18, wpb=495005, bsz=16303.2, num_updates=50700, lr=0.000280883, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=43456 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 209 / 1689 loss=3.498, nll_loss=1.941, ppl=3.84, wps=580536, ups=1.17, wpb=495540, bsz=16625.8, num_updates=50800, lr=0.000280607, gnorm=0.138, clip=0, loss_scale=1, train_wall=84, gb_free=61, wall=43541 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 309 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587277, ups=1.19, wpb=495370, bsz=16242.8, num_updates=50900, lr=0.000280331, gnorm=0.134, clip=0, loss_scale=1, train_wall=83, gb_free=62.3, wall=43625 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 epoch 031: 409 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=579800, ups=1.17, wpb=493633, bsz=16448.4, num_updates=51000, lr=0.000280056, gnorm=0.14, clip=0, loss_scale=1, train_wall=84, gb_free=61.6, wall=43711 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.706 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.685 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 509 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=532483, ups=1.08, wpb=494549, bsz=16820.9, num_updates=51100, lr=0.000279782, gnorm=0.131, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=43803 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 609 / 1689 loss=3.505, nll_loss=1.949, ppl=3.86, wps=589098, ups=1.19, wpb=495402, bsz=16485.5, num_updates=51200, lr=0.000279508, gnorm=0.142, clip=0, loss_scale=1, train_wall=83, gb_free=61.4, wall=43887 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 709 / 1689 loss=3.512, nll_loss=1.957, ppl=3.88, wps=588295, ups=1.19, wpb=496128, bsz=16668, num_updates=51300, lr=0.000279236, gnorm=0.142, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=43972 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 809 / 1689 loss=3.512, nll_loss=1.958, ppl=3.88, wps=585771, ups=1.18, wpb=496078, bsz=16358.8, num_updates=51400, lr=0.000278964, gnorm=0.132, clip=0, loss_scale=2, train_wall=84, gb_free=61.9, wall=44057 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 909 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=585296, ups=1.18, wpb=495779, bsz=16363.7, num_updates=51500, lr=0.000278693, gnorm=0.136, clip=0, loss_scale=2, train_wall=84, gb_free=61.6, wall=44141 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1009 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=585528, ups=1.18, wpb=495126, bsz=16800.6, num_updates=51600, lr=0.000278423, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=44226 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1109 / 1689 loss=3.517, nll_loss=1.963, ppl=3.9, wps=591224, ups=1.19, wpb=496208, bsz=15934.2, num_updates=51700, lr=0.000278154, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=44310 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1209 / 1689 loss=3.515, nll_loss=1.962, ppl=3.9, wps=582627, ups=1.18, wpb=493391, bsz=16745.7, num_updates=51800, lr=0.000277885, gnorm=0.135, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=44394 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1309 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=584176, ups=1.18, wpb=495586, bsz=16256.2, num_updates=51900, lr=0.000277617, gnorm=0.142, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=44479 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 epoch 031: 1409 / 1689 loss=3.518, nll_loss=1.965, ppl=3.9, wps=582442, ups=1.18, wpb=494728, bsz=16833, num_updates=52000, lr=0.00027735, gnorm=0.131, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=44564 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.685 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1509 / 1689 loss=3.524, nll_loss=1.972, ppl=3.92, wps=535568, ups=1.08, wpb=497053, bsz=16848.7, num_updates=52100, lr=0.000277084, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=44657 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 epoch 031: 1609 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=586336, ups=1.18, wpb=494986, bsz=16441.3, num_updates=52200, lr=0.000276818, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=60, wall=44741 end of epoch 31 (average epoch stats below) epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 epoch 031 | loss 3.512 | nll_loss 1.958 | ppl 3.89 | wps 578504 | ups 1.17 | wpb 495141 | bsz 16503.5 | num_updates 52280 | lr 0.000276606 | gnorm 0.135 | clip 0 | loss_scale 4 | train_wall 1408 | gb_free 62.5 | wall 44808 Start iterating over samples epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 20 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=585718, ups=1.19, wpb=493246, bsz=16484.4, num_updates=52300, lr=0.000276553, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=44826 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 121 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=580768, ups=1.17, wpb=494726, bsz=16774.6, num_updates=52400, lr=0.000276289, gnorm=0.139, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=44911 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 221 / 1689 loss=3.496, nll_loss=1.939, ppl=3.84, wps=582376, ups=1.18, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.13, clip=0, loss_scale=4, train_wall=84, gb_free=62.1, wall=44996 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 321 / 1689 loss=3.492, nll_loss=1.935, ppl=3.82, wps=584376, ups=1.18, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=45081 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 421 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=588388, ups=1.18, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45165 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 521 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=582083, ups=1.18, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.137, clip=0, loss_scale=4, train_wall=84, gb_free=61.3, wall=45250 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 621 / 1689 loss=3.508, nll_loss=1.953, ppl=3.87, wps=587181, ups=1.18, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=45334 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 epoch 032: 722 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=579304, ups=1.17, wpb=494290, bsz=16566.6, num_updates=53000, lr=0.000274721, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=61.8, wall=45420 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032 | valid on 'valid' subset | loss 3.711 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.685 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 822 / 1689 loss=3.509, nll_loss=1.954, ppl=3.87, wps=537206, ups=1.09, wpb=494767, bsz=16038.6, num_updates=53100, lr=0.000274462, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=45512 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 923 / 1689 loss=3.512, nll_loss=1.958, ppl=3.89, wps=580428, ups=1.17, wpb=495763, bsz=16513.1, num_updates=53200, lr=0.000274204, gnorm=0.138, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=45597 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1023 / 1689 loss=3.51, nll_loss=1.955, ppl=3.88, wps=590435, ups=1.19, wpb=496290, bsz=16061.1, num_updates=53300, lr=0.000273947, gnorm=0.131, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=45681 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1123 / 1689 loss=3.517, nll_loss=1.964, ppl=3.9, wps=583055, ups=1.18, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.6, wall=45766 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1223 / 1689 loss=3.515, nll_loss=1.962, ppl=3.89, wps=589242, ups=1.19, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=60.3, wall=45850 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1323 / 1689 loss=3.509, nll_loss=1.954, ppl=3.88, wps=588082, ups=1.19, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=45935 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1423 / 1689 loss=3.522, nll_loss=1.969, ppl=3.92, wps=584810, ups=1.18, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.141, clip=0, loss_scale=4, train_wall=83, gb_free=60.3, wall=46019 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1523 / 1689 loss=3.516, nll_loss=1.963, ppl=3.9, wps=586101, ups=1.18, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=62.6, wall=46104 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 epoch 032: 1624 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=578402, ups=1.17, wpb=494310, bsz=16476.6, num_updates=53900, lr=0.000272418, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=46189 end of epoch 32 (average epoch stats below) epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 epoch 032 | loss 3.509 | nll_loss 1.954 | ppl 3.87 | wps 581059 | ups 1.17 | wpb 495102 | bsz 16507.9 | num_updates 53965 | lr 0.000272254 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1408 | gb_free 62.3 | wall 46244 Start iterating over samples epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 epoch 033: 35 / 1689 loss=3.513, nll_loss=1.959, ppl=3.89, wps=582054, ups=1.18, wpb=491892, bsz=16545.4, num_updates=54000, lr=0.000272166, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=46274 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.685 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 135 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=535690, ups=1.08, wpb=495168, bsz=16618.8, num_updates=54100, lr=0.000271914, gnorm=0.14, clip=0, loss_scale=2, train_wall=83, gb_free=62.1, wall=46366 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 235 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=586715, ups=1.18, wpb=496693, bsz=16958.9, num_updates=54200, lr=0.000271663, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.5, wall=46451 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 335 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=584900, ups=1.18, wpb=494001, bsz=16163.9, num_updates=54300, lr=0.000271413, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.2, wall=46535 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 435 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=586467, ups=1.19, wpb=494697, bsz=16487.4, num_updates=54400, lr=0.000271163, gnorm=0.132, clip=0, loss_scale=4, train_wall=83, gb_free=61.7, wall=46620 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 535 / 1689 loss=3.498, nll_loss=1.942, ppl=3.84, wps=587127, ups=1.18, wpb=496269, bsz=16698.1, num_updates=54500, lr=0.000270914, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.6, wall=46704 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 635 / 1689 loss=3.499, nll_loss=1.943, ppl=3.84, wps=585454, ups=1.18, wpb=495416, bsz=16713, num_updates=54600, lr=0.000270666, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=61, wall=46789 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 735 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=588513, ups=1.19, wpb=494131, bsz=16274.1, num_updates=54700, lr=0.000270418, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=46873 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 835 / 1689 loss=3.511, nll_loss=1.956, ppl=3.88, wps=586358, ups=1.19, wpb=494142, bsz=16823.5, num_updates=54800, lr=0.000270172, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=46957 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 935 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=589747, ups=1.19, wpb=496394, bsz=15964.5, num_updates=54900, lr=0.000269925, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62.2, wall=47041 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 epoch 033: 1036 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=580805, ups=1.17, wpb=495200, bsz=16505.7, num_updates=55000, lr=0.00026968, gnorm=0.136, clip=0, loss_scale=4, train_wall=84, gb_free=61.9, wall=47127 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033 | valid on 'valid' subset | loss 3.702 | nll_loss 2.154 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.685 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1136 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=523905, ups=1.06, wpb=494240, bsz=16487.7, num_updates=55100, lr=0.000269435, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.4, wall=47221 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1236 / 1689 loss=3.52, nll_loss=1.967, ppl=3.91, wps=588602, ups=1.19, wpb=495742, bsz=17065.8, num_updates=55200, lr=0.000269191, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=59.4, wall=47305 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1336 / 1689 loss=3.514, nll_loss=1.96, ppl=3.89, wps=589531, ups=1.19, wpb=496130, bsz=16390, num_updates=55300, lr=0.000268947, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.5, wall=47389 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1436 / 1689 loss=3.519, nll_loss=1.966, ppl=3.91, wps=588780, ups=1.19, wpb=496398, bsz=16788.8, num_updates=55400, lr=0.000268705, gnorm=0.135, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=47474 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1536 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=586402, ups=1.18, wpb=495398, bsz=16522.5, num_updates=55500, lr=0.000268462, gnorm=0.137, clip=0, loss_scale=8, train_wall=83, gb_free=61.9, wall=47558 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 epoch 033: 1638 / 1689 loss=3.515, nll_loss=1.961, ppl=3.89, wps=575965, ups=1.16, wpb=495414, bsz=16116.5, num_updates=55600, lr=0.000268221, gnorm=0.131, clip=0, loss_scale=2, train_wall=85, gb_free=60, wall=47644 end of epoch 33 (average epoch stats below) epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 epoch 033 | loss 3.505 | nll_loss 1.95 | ppl 3.86 | wps 578770 | ups 1.17 | wpb 495131 | bsz 16506.6 | num_updates 55651 | lr 0.000268098 | gnorm 0.135 | clip 0 | loss_scale 2 | train_wall 1404 | gb_free 63.6 | wall 47686 Start iterating over samples epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 49 / 1689 loss=3.504, nll_loss=1.948, ppl=3.86, wps=585352, ups=1.19, wpb=491202, bsz=16203.9, num_updates=55700, lr=0.00026798, gnorm=0.139, clip=0, loss_scale=2, train_wall=82, gb_free=61.7, wall=47728 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 149 / 1689 loss=3.486, nll_loss=1.928, ppl=3.8, wps=588077, ups=1.19, wpb=496172, bsz=16955.7, num_updates=55800, lr=0.00026774, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=47812 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 249 / 1689 loss=3.488, nll_loss=1.93, ppl=3.81, wps=587090, ups=1.18, wpb=496363, bsz=16040.2, num_updates=55900, lr=0.0002675, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=47897 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 epoch 034: 349 / 1689 loss=3.496, nll_loss=1.939, ppl=3.83, wps=586572, ups=1.18, wpb=495262, bsz=16299, num_updates=56000, lr=0.000267261, gnorm=0.128, clip=0, loss_scale=2, train_wall=83, gb_free=61, wall=47981 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.69 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.685 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 449 / 1689 loss=3.497, nll_loss=1.94, ppl=3.84, wps=537453, ups=1.09, wpb=494514, bsz=16593.9, num_updates=56100, lr=0.000267023, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=48073 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 549 / 1689 loss=3.496, nll_loss=1.94, ppl=3.84, wps=592092, ups=1.19, wpb=496011, bsz=16473.9, num_updates=56200, lr=0.000266785, gnorm=0.14, clip=0, loss_scale=4, train_wall=83, gb_free=62, wall=48157 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 649 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=588658, ups=1.19, wpb=494741, bsz=16668.3, num_updates=56300, lr=0.000266548, gnorm=0.144, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=48241 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 749 / 1689 loss=3.5, nll_loss=1.945, ppl=3.85, wps=591079, ups=1.19, wpb=496431, bsz=16705.3, num_updates=56400, lr=0.000266312, gnorm=0.133, clip=0, loss_scale=4, train_wall=83, gb_free=61.3, wall=48325 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 849 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=589464, ups=1.19, wpb=494612, bsz=16414.8, num_updates=56500, lr=0.000266076, gnorm=0.131, clip=0, loss_scale=4, train_wall=83, gb_free=60.7, wall=48409 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 949 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585132, ups=1.18, wpb=493917, bsz=16682.5, num_updates=56600, lr=0.000265841, gnorm=0.135, clip=0, loss_scale=8, train_wall=83, gb_free=60.9, wall=48494 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1050 / 1689 loss=3.511, nll_loss=1.957, ppl=3.88, wps=584352, ups=1.18, wpb=495049, bsz=16079.4, num_updates=56700, lr=0.000265606, gnorm=0.14, clip=0, loss_scale=4, train_wall=84, gb_free=60.9, wall=48578 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1150 / 1689 loss=3.501, nll_loss=1.945, ppl=3.85, wps=587744, ups=1.19, wpb=495105, bsz=16504.6, num_updates=56800, lr=0.000265372, gnorm=0.137, clip=0, loss_scale=4, train_wall=83, gb_free=61.9, wall=48662 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1251 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=580584, ups=1.17, wpb=495493, bsz=16535.8, num_updates=56900, lr=0.000265139, gnorm=0.134, clip=0, loss_scale=2, train_wall=84, gb_free=61.4, wall=48748 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 epoch 034: 1351 / 1689 loss=3.507, nll_loss=1.952, ppl=3.87, wps=586512, ups=1.18, wpb=496665, bsz=16800.2, num_updates=57000, lr=0.000264906, gnorm=0.14, clip=0, loss_scale=2, train_wall=84, gb_free=61.7, wall=48833 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034 | valid on 'valid' subset | loss 3.695 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.685 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1451 / 1689 loss=3.516, nll_loss=1.962, ppl=3.9, wps=534864, ups=1.08, wpb=495873, bsz=16436.9, num_updates=57100, lr=0.000264674, gnorm=0.139, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=48925 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1551 / 1689 loss=3.509, nll_loss=1.955, ppl=3.88, wps=586806, ups=1.19, wpb=493888, bsz=16512.5, num_updates=57200, lr=0.000264443, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=49009 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 epoch 034: 1651 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=589284, ups=1.19, wpb=495614, bsz=16391.3, num_updates=57300, lr=0.000264212, gnorm=0.132, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49094 end of epoch 34 (average epoch stats below) epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 epoch 034 | loss 3.502 | nll_loss 1.947 | ppl 3.85 | wps 580644 | ups 1.17 | wpb 495121 | bsz 16504.4 | num_updates 57338 | lr 0.000264125 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1403 | gb_free 61.9 | wall 49125 Start iterating over samples epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 62 / 1689 loss=3.491, nll_loss=1.934, ppl=3.82, wps=586248, ups=1.19, wpb=491993, bsz=16364.1, num_updates=57400, lr=0.000263982, gnorm=0.138, clip=0, loss_scale=4, train_wall=82, gb_free=62, wall=49177 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 162 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=590333, ups=1.19, wpb=494070, bsz=16313.2, num_updates=57500, lr=0.000263752, gnorm=0.134, clip=0, loss_scale=4, train_wall=83, gb_free=61.1, wall=49261 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 263 / 1689 loss=3.49, nll_loss=1.932, ppl=3.82, wps=584920, ups=1.18, wpb=496756, bsz=16568.6, num_updates=57600, lr=0.000263523, gnorm=0.137, clip=0, loss_scale=2, train_wall=84, gb_free=61.3, wall=49346 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 363 / 1689 loss=3.486, nll_loss=1.929, ppl=3.81, wps=587007, ups=1.18, wpb=495760, bsz=16469.9, num_updates=57700, lr=0.000263295, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.9, wall=49431 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 463 / 1689 loss=3.5, nll_loss=1.944, ppl=3.85, wps=586106, ups=1.19, wpb=494284, bsz=16535.2, num_updates=57800, lr=0.000263067, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=49515 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 563 / 1689 loss=3.497, nll_loss=1.941, ppl=3.84, wps=589403, ups=1.19, wpb=495462, bsz=16232.1, num_updates=57900, lr=0.00026284, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=60.1, wall=49599 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 epoch 035: 663 / 1689 loss=3.504, nll_loss=1.949, ppl=3.86, wps=589734, ups=1.19, wpb=494975, bsz=16341.4, num_updates=58000, lr=0.000262613, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=49683 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.698 | nll_loss 2.148 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.685 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 763 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=511349, ups=1.03, wpb=494917, bsz=16871.9, num_updates=58100, lr=0.000262387, gnorm=0.136, clip=0, loss_scale=4, train_wall=83, gb_free=61.8, wall=49780 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 864 / 1689 loss=3.499, nll_loss=1.944, ppl=3.85, wps=586051, ups=1.18, wpb=495938, bsz=16512.5, num_updates=58200, lr=0.000262161, gnorm=0.137, clip=0, loss_scale=2, train_wall=83, gb_free=61.8, wall=49864 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 964 / 1689 loss=3.505, nll_loss=1.95, ppl=3.86, wps=587987, ups=1.19, wpb=495185, bsz=16671.7, num_updates=58300, lr=0.000261936, gnorm=0.138, clip=0, loss_scale=2, train_wall=83, gb_free=60.4, wall=49948 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1064 / 1689 loss=3.499, nll_loss=1.943, ppl=3.85, wps=589691, ups=1.19, wpb=496118, bsz=16395.4, num_updates=58400, lr=0.000261712, gnorm=0.134, clip=0, loss_scale=2, train_wall=83, gb_free=61.5, wall=50033 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1165 / 1689 loss=3.506, nll_loss=1.951, ppl=3.87, wps=581967, ups=1.17, wpb=496415, bsz=16357.2, num_updates=58500, lr=0.000261488, gnorm=0.136, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50118 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1265 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=584054, ups=1.18, wpb=494429, bsz=16297.8, num_updates=58600, lr=0.000261265, gnorm=0.142, clip=0, loss_scale=1, train_wall=84, gb_free=62.1, wall=50203 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1365 / 1689 loss=3.507, nll_loss=1.953, ppl=3.87, wps=584146, ups=1.18, wpb=493561, bsz=16438.5, num_updates=58700, lr=0.000261042, gnorm=0.132, clip=0, loss_scale=1, train_wall=83, gb_free=61.3, wall=50287 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1465 / 1689 loss=3.503, nll_loss=1.948, ppl=3.86, wps=585438, ups=1.18, wpb=496440, bsz=16622.3, num_updates=58800, lr=0.00026082, gnorm=0.129, clip=0, loss_scale=1, train_wall=84, gb_free=60.7, wall=50372 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1565 / 1689 loss=3.508, nll_loss=1.954, ppl=3.87, wps=584838, ups=1.18, wpb=495117, bsz=16811.2, num_updates=58900, lr=0.000260599, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=60.3, wall=50457 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 epoch 035: 1665 / 1689 loss=3.512, nll_loss=1.959, ppl=3.89, wps=584900, ups=1.18, wpb=495063, bsz=16826.7, num_updates=59000, lr=0.000260378, gnorm=0.131, clip=0, loss_scale=2, train_wall=84, gb_free=61, wall=50541 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 epoch 035 | valid on 'valid' subset | loss 3.706 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.685 end of epoch 35 (average epoch stats below) epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 epoch 035 | loss 3.499 | nll_loss 1.943 | ppl 3.85 | wps 577699 | ups 1.17 | wpb 495114 | bsz 16505.9 | num_updates 59024 | lr 0.000260325 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 1405 | gb_free 62.6 | wall 50570 Start iterating over samples epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 76 / 1689 loss=3.481, nll_loss=1.923, ppl=3.79, wps=526560, ups=1.07, wpb=491893, bsz=16777.7, num_updates=59100, lr=0.000260157, gnorm=0.136, clip=0, loss_scale=2, train_wall=83, gb_free=61.3, wall=50635 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 176 / 1689 loss=3.478, nll_loss=1.919, ppl=3.78, wps=588706, ups=1.19, wpb=493507, bsz=16228.9, num_updates=59200, lr=0.000259938, gnorm=0.143, clip=0, loss_scale=2, train_wall=83, gb_free=61.7, wall=50718 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 277 / 1689 loss=3.486, nll_loss=1.928, ppl=3.81, wps=581185, ups=1.17, wpb=494932, bsz=16276.1, num_updates=59300, lr=0.000259718, gnorm=0.139, clip=0, loss_scale=1, train_wall=84, gb_free=61.4, wall=50804 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 377 / 1689 loss=3.494, nll_loss=1.937, ppl=3.83, wps=587339, ups=1.18, wpb=496408, bsz=16741.4, num_updates=59400, lr=0.0002595, gnorm=0.137, clip=0, loss_scale=1, train_wall=83, gb_free=61.9, wall=50888 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 477 / 1689 loss=3.483, nll_loss=1.925, ppl=3.8, wps=587804, ups=1.19, wpb=495751, bsz=16603.4, num_updates=59500, lr=0.000259281, gnorm=0.129, clip=0, loss_scale=1, train_wall=83, gb_free=61.5, wall=50972 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 577 / 1689 loss=3.493, nll_loss=1.936, ppl=3.83, wps=586087, ups=1.18, wpb=495308, bsz=16373.9, num_updates=59600, lr=0.000259064, gnorm=0.136, clip=0, loss_scale=1, train_wall=83, gb_free=61.2, wall=51057 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 677 / 1689 loss=3.495, nll_loss=1.938, ppl=3.83, wps=588255, ups=1.19, wpb=494176, bsz=16226.2, num_updates=59700, lr=0.000258847, gnorm=0.139, clip=0, loss_scale=1, train_wall=83, gb_free=61.7, wall=51141 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 777 / 1689 loss=3.502, nll_loss=1.947, ppl=3.86, wps=584375, ups=1.18, wpb=493981, bsz=16296.5, num_updates=59800, lr=0.00025863, gnorm=0.135, clip=0, loss_scale=2, train_wall=83, gb_free=61.6, wall=51225 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 877 / 1689 loss=3.495, nll_loss=1.939, ppl=3.83, wps=585775, ups=1.19, wpb=494113, bsz=16353.3, num_updates=59900, lr=0.000258414, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=60.7, wall=51310 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 epoch 036: 977 / 1689 loss=3.497, nll_loss=1.942, ppl=3.84, wps=585506, ups=1.18, wpb=495269, bsz=16519.1, num_updates=60000, lr=0.000258199, gnorm=0.133, clip=0, loss_scale=2, train_wall=83, gb_free=61.4, wall=51394 Stopping training due to num_updates: 60000 >= max_update: 60000 begin validation on "valid" subset epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.685 end of epoch 36 (average epoch stats below) epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 epoch 036 | loss 3.49 | nll_loss 1.933 | ppl 3.82 | wps 579321 | ups 1.17 | wpb 494809 | bsz 16450.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.136 | clip 0 | loss_scale 2 | train_wall 813 | gb_free 61.4 | wall 51403 done training in 51393.4 seconds