diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..205ffc00b0cd5da21716a8d717cdab106074cd3e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,59 @@
+---
+license: other
+library_name: peft
+tags:
+- llama-factory
+- lora
+- generated_from_trainer
+base_model: hfl/chinese-alpaca-2-1.3b
+model-index:
+- name: train_2024-03-14-05-56-29
+ results: []
+---
+
+
+
+# train_2024-03-14-05-56-29
+
+This model is a fine-tuned version of [hfl/chinese-alpaca-2-1.3b](https://huggingface.co/hfl/chinese-alpaca-2-1.3b) on the alpaca_zh dataset.
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 2
+- eval_batch_size: 8
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- num_epochs: 1.0
+- mixed_precision_training: Native AMP
+
+### Training results
+
+
+
+### Framework versions
+
+- PEFT 0.9.0
+- Transformers 4.38.2
+- Pytorch 2.2.1+cu121
+- Datasets 2.18.0
+- Tokenizers 0.15.2
\ No newline at end of file
diff --git a/adapter_config.json b/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/adapter_model.safetensors b/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f627743fc8052e394d07d8452fb86a248e220795
--- /dev/null
+++ b/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:462bcac6c9586774f4979f614b6ec3f18bbc2f0febcdd1c766687b7f2056c66a
+size 2099272
diff --git a/all_results.json b/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b5f4085a932af8484c59597eb4f5cc1bb81a42f8
--- /dev/null
+++ b/all_results.json
@@ -0,0 +1,7 @@
+{
+ "epoch": 0.48,
+ "train_loss": 2.0602192145127516,
+ "train_runtime": 801.4891,
+ "train_samples_per_second": 64.198,
+ "train_steps_per_second": 4.011
+}
\ No newline at end of file
diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-100/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-100/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e93177b961498c33143fb1d3c1fa99de4f19e01d
--- /dev/null
+++ b/checkpoint-100/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d36ee8ac772f766edf9dbbfd9624d8e73a0cfd65f5df48211d324cc57a4e5c6
+size 2099272
diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..023f9eeb4c52e156b65246e54d1a04234318f2ab
--- /dev/null
+++ b/checkpoint-100/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f41177184e0c283aa638b8ba6cde4ccf932b0a8e13fad8e15ae916a4cb18c62
+size 4208302
diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..162c42af6b1715633121d91ddaa9ad6d6b894acc
--- /dev/null
+++ b/checkpoint-100/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5b99b025074fe1142a3334c2107e39b98b9d36c2981cb810283beb4adc7ac94
+size 14244
diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d6c8d3ac24feb7d7a857c7f9590ce22e8b517a51
--- /dev/null
+++ b/checkpoint-100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2197384a954a4fcf1ccbd8df7831a0d78a90198a2460b1cc6c71d1497ca1586
+size 1064
diff --git a/checkpoint-100/special_tokens_map.json b/checkpoint-100/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-100/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-100/tokenizer.model b/checkpoint-100/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-100/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-100/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..86ccc7f1deda8a29046e7d72e06e1642e3c0408a
--- /dev/null
+++ b/checkpoint-100/trainer_state.json
@@ -0,0 +1,161 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.031095735997201383,
+ "eval_steps": 500,
+ "global_step": 100,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 1342876749004800.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-1000/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-1000/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1000/adapter_model.safetensors b/checkpoint-1000/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e08ac9f1131a9eca827b7983ab68ecdc0e0e746a
--- /dev/null
+++ b/checkpoint-1000/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48a2d9599d49f21f6da2043f811c7fc36667538dcd0694bf5ab45264e604eb6a
+size 2099272
diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..713871392f596e05c0afccf6387210df924ac1f4
--- /dev/null
+++ b/checkpoint-1000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f006162ce9a8ae562e525479e6cbcc5e65fcf10721c1c878a1eba1f3248032d
+size 4208302
diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..72558266052bf947b48a003cb72b89f6e3de0769
--- /dev/null
+++ b/checkpoint-1000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bab00d452a5c7da709ec7f6117cea515f8bedf68f40e39d014d87811d017f294
+size 14244
diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9af51ea8ab150a610a34adbcb06e117504b159da
--- /dev/null
+++ b/checkpoint-1000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0997e59d7add886c3367f611ab22a16eaa2f60d42a1ebdb93b4ad46ac309297
+size 1064
diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-1000/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-1000/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-1000/tokenizer.model b/checkpoint-1000/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-1000/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-1000/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..af8a9908632620735408ddc8508673478ec4dab4
--- /dev/null
+++ b/checkpoint-1000/trainer_state.json
@@ -0,0 +1,1421 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.31095735997201385,
+ "eval_steps": 500,
+ "global_step": 1000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.8182051181793213,
+ "learning_rate": 4.701736843304025e-05,
+ "loss": 2.0951,
+ "step": 505
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.507320761680603,
+ "learning_rate": 4.695924727871805e-05,
+ "loss": 2.0253,
+ "step": 510
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.759121835231781,
+ "learning_rate": 4.690060192757242e-05,
+ "loss": 2.0602,
+ "step": 515
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.5943195819854736,
+ "learning_rate": 4.684143377954691e-05,
+ "loss": 2.0386,
+ "step": 520
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.8568710088729858,
+ "learning_rate": 4.6781744247064955e-05,
+ "loss": 2.073,
+ "step": 525
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.3352620601654053,
+ "learning_rate": 4.6721534754996125e-05,
+ "loss": 2.1443,
+ "step": 530
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.3417474031448364,
+ "learning_rate": 4.666080674062213e-05,
+ "loss": 2.0288,
+ "step": 535
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.5334464311599731,
+ "learning_rate": 4.659956165360251e-05,
+ "loss": 2.0609,
+ "step": 540
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.9658721089363098,
+ "learning_rate": 4.6537800955940005e-05,
+ "loss": 1.9539,
+ "step": 545
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.9197947978973389,
+ "learning_rate": 4.647552612194572e-05,
+ "loss": 2.149,
+ "step": 550
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.8512137532234192,
+ "learning_rate": 4.641273863820383e-05,
+ "loss": 1.9722,
+ "step": 555
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.827289342880249,
+ "learning_rate": 4.634944000353622e-05,
+ "loss": 2.0729,
+ "step": 560
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.088416337966919,
+ "learning_rate": 4.628563172896655e-05,
+ "loss": 1.9507,
+ "step": 565
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3566908836364746,
+ "learning_rate": 4.6221315337684353e-05,
+ "loss": 2.1643,
+ "step": 570
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3541293144226074,
+ "learning_rate": 4.615649236500854e-05,
+ "loss": 2.1839,
+ "step": 575
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 0.991269588470459,
+ "learning_rate": 4.609116435835083e-05,
+ "loss": 2.0976,
+ "step": 580
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.0280535221099854,
+ "learning_rate": 4.602533287717877e-05,
+ "loss": 2.1474,
+ "step": 585
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.013123631477356,
+ "learning_rate": 4.5958999492978524e-05,
+ "loss": 2.1873,
+ "step": 590
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1753040552139282,
+ "learning_rate": 4.589216578921737e-05,
+ "loss": 2.1744,
+ "step": 595
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1839090585708618,
+ "learning_rate": 4.582483336130586e-05,
+ "loss": 1.9982,
+ "step": 600
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.0724798440933228,
+ "learning_rate": 4.575700381655979e-05,
+ "loss": 2.1234,
+ "step": 605
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 2.009913682937622,
+ "learning_rate": 4.5688678774161796e-05,
+ "loss": 1.9478,
+ "step": 610
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.9897060394287109,
+ "learning_rate": 4.561985986512271e-05,
+ "loss": 1.8268,
+ "step": 615
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.8881808519363403,
+ "learning_rate": 4.555054873224263e-05,
+ "loss": 1.9887,
+ "step": 620
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.155900001525879,
+ "learning_rate": 4.54807470300717e-05,
+ "loss": 2.0777,
+ "step": 625
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.8782421350479126,
+ "learning_rate": 4.5410456424870596e-05,
+ "loss": 2.0566,
+ "step": 630
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.3324674367904663,
+ "learning_rate": 4.5339678594570795e-05,
+ "loss": 2.047,
+ "step": 635
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.9805939197540283,
+ "learning_rate": 4.526841522873449e-05,
+ "loss": 1.962,
+ "step": 640
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4999943971633911,
+ "learning_rate": 4.519666802851422e-05,
+ "loss": 2.0972,
+ "step": 645
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4504961967468262,
+ "learning_rate": 4.5124438706612376e-05,
+ "loss": 2.0041,
+ "step": 650
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.9078169465065002,
+ "learning_rate": 4.505172898724018e-05,
+ "loss": 2.1229,
+ "step": 655
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.1635804176330566,
+ "learning_rate": 4.497854060607662e-05,
+ "loss": 2.0195,
+ "step": 660
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.46576726436615,
+ "learning_rate": 4.490487531022699e-05,
+ "loss": 2.0745,
+ "step": 665
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.2094652652740479,
+ "learning_rate": 4.4830734858181145e-05,
+ "loss": 2.1068,
+ "step": 670
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.4738895893096924,
+ "learning_rate": 4.47561210197716e-05,
+ "loss": 1.8088,
+ "step": 675
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.23384690284729,
+ "learning_rate": 4.4681035576131215e-05,
+ "loss": 2.0995,
+ "step": 680
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.8332946300506592,
+ "learning_rate": 4.46054803196507e-05,
+ "loss": 2.0541,
+ "step": 685
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.9207485318183899,
+ "learning_rate": 4.452945705393586e-05,
+ "loss": 2.166,
+ "step": 690
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.292945146560669,
+ "learning_rate": 4.445296759376449e-05,
+ "loss": 2.0784,
+ "step": 695
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9874763488769531,
+ "learning_rate": 4.437601376504307e-05,
+ "loss": 2.2087,
+ "step": 700
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9427415132522583,
+ "learning_rate": 4.4298597404763186e-05,
+ "loss": 2.1199,
+ "step": 705
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.7369529008865356,
+ "learning_rate": 4.422072036095768e-05,
+ "loss": 2.0355,
+ "step": 710
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2423696517944336,
+ "learning_rate": 4.414238449265654e-05,
+ "loss": 2.0011,
+ "step": 715
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2304831743240356,
+ "learning_rate": 4.406359166984249e-05,
+ "loss": 2.0368,
+ "step": 720
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 0.9090413451194763,
+ "learning_rate": 4.39843437734064e-05,
+ "loss": 1.9983,
+ "step": 725
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.2729507684707642,
+ "learning_rate": 4.390464269510233e-05,
+ "loss": 2.021,
+ "step": 730
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3009227514266968,
+ "learning_rate": 4.382449033750244e-05,
+ "loss": 1.9743,
+ "step": 735
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.5456056594848633,
+ "learning_rate": 4.37438886139515e-05,
+ "loss": 2.0689,
+ "step": 740
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3235007524490356,
+ "learning_rate": 4.3662839448521264e-05,
+ "loss": 2.0838,
+ "step": 745
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 2.2074007987976074,
+ "learning_rate": 4.358134477596454e-05,
+ "loss": 2.0835,
+ "step": 750
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.403738021850586,
+ "learning_rate": 4.3499406541668966e-05,
+ "loss": 2.0916,
+ "step": 755
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0940325260162354,
+ "learning_rate": 4.3417026701610616e-05,
+ "loss": 1.972,
+ "step": 760
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.666353702545166,
+ "learning_rate": 4.3334207222307275e-05,
+ "loss": 1.927,
+ "step": 765
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0777515172958374,
+ "learning_rate": 4.325095008077154e-05,
+ "loss": 2.1192,
+ "step": 770
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.7218186855316162,
+ "learning_rate": 4.316725726446353e-05,
+ "loss": 2.0774,
+ "step": 775
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.356753945350647,
+ "learning_rate": 4.3083130771243586e-05,
+ "loss": 2.0847,
+ "step": 780
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.9967429637908936,
+ "learning_rate": 4.299857260932445e-05,
+ "loss": 2.0485,
+ "step": 785
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.6216442584991455,
+ "learning_rate": 4.2913584797223397e-05,
+ "loss": 2.1008,
+ "step": 790
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.2556742429733276,
+ "learning_rate": 4.2828169363714016e-05,
+ "loss": 1.9209,
+ "step": 795
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1800439357757568,
+ "learning_rate": 4.274232834777782e-05,
+ "loss": 1.9722,
+ "step": 800
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1313499212265015,
+ "learning_rate": 4.2656063798555515e-05,
+ "loss": 1.9176,
+ "step": 805
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.137534737586975,
+ "learning_rate": 4.256937777529815e-05,
+ "loss": 1.9929,
+ "step": 810
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.0575093030929565,
+ "learning_rate": 4.2482272347317906e-05,
+ "loss": 2.166,
+ "step": 815
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.5939594507217407,
+ "learning_rate": 4.2394749593938733e-05,
+ "loss": 2.1334,
+ "step": 820
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1045507192611694,
+ "learning_rate": 4.230681160444669e-05,
+ "loss": 2.0853,
+ "step": 825
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.3480136394500732,
+ "learning_rate": 4.221846047804009e-05,
+ "loss": 2.1802,
+ "step": 830
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1822657585144043,
+ "learning_rate": 4.2129698323779366e-05,
+ "loss": 2.0739,
+ "step": 835
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1771117448806763,
+ "learning_rate": 4.204052726053676e-05,
+ "loss": 2.0238,
+ "step": 840
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.4757814407348633,
+ "learning_rate": 4.195094941694571e-05,
+ "loss": 2.1557,
+ "step": 845
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 0.9095075726509094,
+ "learning_rate": 4.1860966931350054e-05,
+ "loss": 2.1666,
+ "step": 850
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1039543151855469,
+ "learning_rate": 4.1770581951752976e-05,
+ "loss": 2.105,
+ "step": 855
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 0.8517205119132996,
+ "learning_rate": 4.1679796635765735e-05,
+ "loss": 1.9656,
+ "step": 860
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.239492654800415,
+ "learning_rate": 4.158861315055617e-05,
+ "loss": 2.0166,
+ "step": 865
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1358321905136108,
+ "learning_rate": 4.1497033672796924e-05,
+ "loss": 2.0076,
+ "step": 870
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.6215249300003052,
+ "learning_rate": 4.140506038861356e-05,
+ "loss": 2.1594,
+ "step": 875
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.0528080463409424,
+ "learning_rate": 4.131269549353229e-05,
+ "loss": 2.1416,
+ "step": 880
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 0.8976901769638062,
+ "learning_rate": 4.1219941192427644e-05,
+ "loss": 2.1242,
+ "step": 885
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.263594388961792,
+ "learning_rate": 4.112679969946977e-05,
+ "loss": 2.02,
+ "step": 890
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4173017740249634,
+ "learning_rate": 4.103327323807162e-05,
+ "loss": 2.0438,
+ "step": 895
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.876170039176941,
+ "learning_rate": 4.093936404083585e-05,
+ "loss": 1.9806,
+ "step": 900
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4649231433868408,
+ "learning_rate": 4.0845074349501544e-05,
+ "loss": 2.1476,
+ "step": 905
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0446043014526367,
+ "learning_rate": 4.0750406414890695e-05,
+ "loss": 1.9672,
+ "step": 910
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0225305557250977,
+ "learning_rate": 4.065536249685448e-05,
+ "loss": 1.9984,
+ "step": 915
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0120617151260376,
+ "learning_rate": 4.055994486421929e-05,
+ "loss": 2.1162,
+ "step": 920
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0469881296157837,
+ "learning_rate": 4.04641557947326e-05,
+ "loss": 2.0435,
+ "step": 925
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.2435941696166992,
+ "learning_rate": 4.036799757500856e-05,
+ "loss": 2.0431,
+ "step": 930
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0055103302001953,
+ "learning_rate": 4.027147250047348e-05,
+ "loss": 2.2021,
+ "step": 935
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1212949752807617,
+ "learning_rate": 4.017458287531094e-05,
+ "loss": 1.997,
+ "step": 940
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1048357486724854,
+ "learning_rate": 4.007733101240685e-05,
+ "loss": 1.946,
+ "step": 945
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4721689224243164,
+ "learning_rate": 3.997971923329426e-05,
+ "loss": 2.0723,
+ "step": 950
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.3793156147003174,
+ "learning_rate": 3.988174986809783e-05,
+ "loss": 2.034,
+ "step": 955
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9013482928276062,
+ "learning_rate": 3.9783425255478355e-05,
+ "loss": 1.9736,
+ "step": 960
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9192422032356262,
+ "learning_rate": 3.968474774257682e-05,
+ "loss": 1.9878,
+ "step": 965
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.9304206371307373,
+ "learning_rate": 3.9585719684958446e-05,
+ "loss": 2.117,
+ "step": 970
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.0435137748718262,
+ "learning_rate": 3.948634344655639e-05,
+ "loss": 2.0585,
+ "step": 975
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4636590480804443,
+ "learning_rate": 3.938662139961538e-05,
+ "loss": 2.0409,
+ "step": 980
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.8014529943466187,
+ "learning_rate": 3.928655592463508e-05,
+ "loss": 2.0369,
+ "step": 985
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2412620782852173,
+ "learning_rate": 3.918614941031319e-05,
+ "loss": 1.967,
+ "step": 990
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.3581103086471558,
+ "learning_rate": 3.908540425348852e-05,
+ "loss": 2.0037,
+ "step": 995
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2377780675888062,
+ "learning_rate": 3.8984322859083725e-05,
+ "loss": 1.9991,
+ "step": 1000
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 1.343065816498176e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-1000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-1100/README.md b/checkpoint-1100/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-1100/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-1100/adapter_config.json b/checkpoint-1100/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-1100/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1100/adapter_model.safetensors b/checkpoint-1100/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0fab7c359f49f806f7a707f85bafc1d3bbaa0194
--- /dev/null
+++ b/checkpoint-1100/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a683217eb65ff427c775be7b4f2b17fa3e4d2a1f4ec2c6f932ead2131164d58
+size 2099272
diff --git a/checkpoint-1100/optimizer.pt b/checkpoint-1100/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b9331e6086ee91332160742707cb1a78f346b726
--- /dev/null
+++ b/checkpoint-1100/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f09f84a51333688d2c1ffa008ee924f88b13e5b05cf3e96c960de16b6a3ec732
+size 4208302
diff --git a/checkpoint-1100/rng_state.pth b/checkpoint-1100/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..634fb2b7b3aace9c27427af47144ca2c2f16720b
--- /dev/null
+++ b/checkpoint-1100/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea2be7046a8cfae98823a1a5937a6b641a96662a439d14f80e764a9be0f430b4
+size 14244
diff --git a/checkpoint-1100/scheduler.pt b/checkpoint-1100/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5424712ae48d8b747d8e1acccad0a2e32c6ef196
--- /dev/null
+++ b/checkpoint-1100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9ebce50ac9027c187ef9430639e84c374e26350a5f18c89c2fee60ddec9bbbf
+size 1064
diff --git a/checkpoint-1100/special_tokens_map.json b/checkpoint-1100/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-1100/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-1100/tokenizer.model b/checkpoint-1100/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-1100/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-1100/tokenizer_config.json b/checkpoint-1100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-1100/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-1100/trainer_state.json b/checkpoint-1100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c721a7555f4c3a72641e8f61392875b3d43c6890
--- /dev/null
+++ b/checkpoint-1100/trainer_state.json
@@ -0,0 +1,1561 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.34205309596921524,
+ "eval_steps": 500,
+ "global_step": 1100,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.8182051181793213,
+ "learning_rate": 4.701736843304025e-05,
+ "loss": 2.0951,
+ "step": 505
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.507320761680603,
+ "learning_rate": 4.695924727871805e-05,
+ "loss": 2.0253,
+ "step": 510
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.759121835231781,
+ "learning_rate": 4.690060192757242e-05,
+ "loss": 2.0602,
+ "step": 515
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.5943195819854736,
+ "learning_rate": 4.684143377954691e-05,
+ "loss": 2.0386,
+ "step": 520
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.8568710088729858,
+ "learning_rate": 4.6781744247064955e-05,
+ "loss": 2.073,
+ "step": 525
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.3352620601654053,
+ "learning_rate": 4.6721534754996125e-05,
+ "loss": 2.1443,
+ "step": 530
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.3417474031448364,
+ "learning_rate": 4.666080674062213e-05,
+ "loss": 2.0288,
+ "step": 535
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.5334464311599731,
+ "learning_rate": 4.659956165360251e-05,
+ "loss": 2.0609,
+ "step": 540
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.9658721089363098,
+ "learning_rate": 4.6537800955940005e-05,
+ "loss": 1.9539,
+ "step": 545
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.9197947978973389,
+ "learning_rate": 4.647552612194572e-05,
+ "loss": 2.149,
+ "step": 550
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.8512137532234192,
+ "learning_rate": 4.641273863820383e-05,
+ "loss": 1.9722,
+ "step": 555
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.827289342880249,
+ "learning_rate": 4.634944000353622e-05,
+ "loss": 2.0729,
+ "step": 560
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.088416337966919,
+ "learning_rate": 4.628563172896655e-05,
+ "loss": 1.9507,
+ "step": 565
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3566908836364746,
+ "learning_rate": 4.6221315337684353e-05,
+ "loss": 2.1643,
+ "step": 570
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3541293144226074,
+ "learning_rate": 4.615649236500854e-05,
+ "loss": 2.1839,
+ "step": 575
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 0.991269588470459,
+ "learning_rate": 4.609116435835083e-05,
+ "loss": 2.0976,
+ "step": 580
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.0280535221099854,
+ "learning_rate": 4.602533287717877e-05,
+ "loss": 2.1474,
+ "step": 585
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.013123631477356,
+ "learning_rate": 4.5958999492978524e-05,
+ "loss": 2.1873,
+ "step": 590
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1753040552139282,
+ "learning_rate": 4.589216578921737e-05,
+ "loss": 2.1744,
+ "step": 595
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1839090585708618,
+ "learning_rate": 4.582483336130586e-05,
+ "loss": 1.9982,
+ "step": 600
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.0724798440933228,
+ "learning_rate": 4.575700381655979e-05,
+ "loss": 2.1234,
+ "step": 605
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 2.009913682937622,
+ "learning_rate": 4.5688678774161796e-05,
+ "loss": 1.9478,
+ "step": 610
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.9897060394287109,
+ "learning_rate": 4.561985986512271e-05,
+ "loss": 1.8268,
+ "step": 615
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.8881808519363403,
+ "learning_rate": 4.555054873224263e-05,
+ "loss": 1.9887,
+ "step": 620
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.155900001525879,
+ "learning_rate": 4.54807470300717e-05,
+ "loss": 2.0777,
+ "step": 625
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.8782421350479126,
+ "learning_rate": 4.5410456424870596e-05,
+ "loss": 2.0566,
+ "step": 630
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.3324674367904663,
+ "learning_rate": 4.5339678594570795e-05,
+ "loss": 2.047,
+ "step": 635
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.9805939197540283,
+ "learning_rate": 4.526841522873449e-05,
+ "loss": 1.962,
+ "step": 640
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4999943971633911,
+ "learning_rate": 4.519666802851422e-05,
+ "loss": 2.0972,
+ "step": 645
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4504961967468262,
+ "learning_rate": 4.5124438706612376e-05,
+ "loss": 2.0041,
+ "step": 650
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.9078169465065002,
+ "learning_rate": 4.505172898724018e-05,
+ "loss": 2.1229,
+ "step": 655
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.1635804176330566,
+ "learning_rate": 4.497854060607662e-05,
+ "loss": 2.0195,
+ "step": 660
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.46576726436615,
+ "learning_rate": 4.490487531022699e-05,
+ "loss": 2.0745,
+ "step": 665
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.2094652652740479,
+ "learning_rate": 4.4830734858181145e-05,
+ "loss": 2.1068,
+ "step": 670
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.4738895893096924,
+ "learning_rate": 4.47561210197716e-05,
+ "loss": 1.8088,
+ "step": 675
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.23384690284729,
+ "learning_rate": 4.4681035576131215e-05,
+ "loss": 2.0995,
+ "step": 680
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.8332946300506592,
+ "learning_rate": 4.46054803196507e-05,
+ "loss": 2.0541,
+ "step": 685
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.9207485318183899,
+ "learning_rate": 4.452945705393586e-05,
+ "loss": 2.166,
+ "step": 690
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.292945146560669,
+ "learning_rate": 4.445296759376449e-05,
+ "loss": 2.0784,
+ "step": 695
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9874763488769531,
+ "learning_rate": 4.437601376504307e-05,
+ "loss": 2.2087,
+ "step": 700
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9427415132522583,
+ "learning_rate": 4.4298597404763186e-05,
+ "loss": 2.1199,
+ "step": 705
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.7369529008865356,
+ "learning_rate": 4.422072036095768e-05,
+ "loss": 2.0355,
+ "step": 710
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2423696517944336,
+ "learning_rate": 4.414238449265654e-05,
+ "loss": 2.0011,
+ "step": 715
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2304831743240356,
+ "learning_rate": 4.406359166984249e-05,
+ "loss": 2.0368,
+ "step": 720
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 0.9090413451194763,
+ "learning_rate": 4.39843437734064e-05,
+ "loss": 1.9983,
+ "step": 725
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.2729507684707642,
+ "learning_rate": 4.390464269510233e-05,
+ "loss": 2.021,
+ "step": 730
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3009227514266968,
+ "learning_rate": 4.382449033750244e-05,
+ "loss": 1.9743,
+ "step": 735
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.5456056594848633,
+ "learning_rate": 4.37438886139515e-05,
+ "loss": 2.0689,
+ "step": 740
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3235007524490356,
+ "learning_rate": 4.3662839448521264e-05,
+ "loss": 2.0838,
+ "step": 745
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 2.2074007987976074,
+ "learning_rate": 4.358134477596454e-05,
+ "loss": 2.0835,
+ "step": 750
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.403738021850586,
+ "learning_rate": 4.3499406541668966e-05,
+ "loss": 2.0916,
+ "step": 755
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0940325260162354,
+ "learning_rate": 4.3417026701610616e-05,
+ "loss": 1.972,
+ "step": 760
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.666353702545166,
+ "learning_rate": 4.3334207222307275e-05,
+ "loss": 1.927,
+ "step": 765
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0777515172958374,
+ "learning_rate": 4.325095008077154e-05,
+ "loss": 2.1192,
+ "step": 770
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.7218186855316162,
+ "learning_rate": 4.316725726446353e-05,
+ "loss": 2.0774,
+ "step": 775
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.356753945350647,
+ "learning_rate": 4.3083130771243586e-05,
+ "loss": 2.0847,
+ "step": 780
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.9967429637908936,
+ "learning_rate": 4.299857260932445e-05,
+ "loss": 2.0485,
+ "step": 785
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.6216442584991455,
+ "learning_rate": 4.2913584797223397e-05,
+ "loss": 2.1008,
+ "step": 790
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.2556742429733276,
+ "learning_rate": 4.2828169363714016e-05,
+ "loss": 1.9209,
+ "step": 795
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1800439357757568,
+ "learning_rate": 4.274232834777782e-05,
+ "loss": 1.9722,
+ "step": 800
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1313499212265015,
+ "learning_rate": 4.2656063798555515e-05,
+ "loss": 1.9176,
+ "step": 805
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.137534737586975,
+ "learning_rate": 4.256937777529815e-05,
+ "loss": 1.9929,
+ "step": 810
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.0575093030929565,
+ "learning_rate": 4.2482272347317906e-05,
+ "loss": 2.166,
+ "step": 815
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.5939594507217407,
+ "learning_rate": 4.2394749593938733e-05,
+ "loss": 2.1334,
+ "step": 820
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1045507192611694,
+ "learning_rate": 4.230681160444669e-05,
+ "loss": 2.0853,
+ "step": 825
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.3480136394500732,
+ "learning_rate": 4.221846047804009e-05,
+ "loss": 2.1802,
+ "step": 830
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1822657585144043,
+ "learning_rate": 4.2129698323779366e-05,
+ "loss": 2.0739,
+ "step": 835
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1771117448806763,
+ "learning_rate": 4.204052726053676e-05,
+ "loss": 2.0238,
+ "step": 840
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.4757814407348633,
+ "learning_rate": 4.195094941694571e-05,
+ "loss": 2.1557,
+ "step": 845
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 0.9095075726509094,
+ "learning_rate": 4.1860966931350054e-05,
+ "loss": 2.1666,
+ "step": 850
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1039543151855469,
+ "learning_rate": 4.1770581951752976e-05,
+ "loss": 2.105,
+ "step": 855
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 0.8517205119132996,
+ "learning_rate": 4.1679796635765735e-05,
+ "loss": 1.9656,
+ "step": 860
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.239492654800415,
+ "learning_rate": 4.158861315055617e-05,
+ "loss": 2.0166,
+ "step": 865
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1358321905136108,
+ "learning_rate": 4.1497033672796924e-05,
+ "loss": 2.0076,
+ "step": 870
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.6215249300003052,
+ "learning_rate": 4.140506038861356e-05,
+ "loss": 2.1594,
+ "step": 875
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.0528080463409424,
+ "learning_rate": 4.131269549353229e-05,
+ "loss": 2.1416,
+ "step": 880
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 0.8976901769638062,
+ "learning_rate": 4.1219941192427644e-05,
+ "loss": 2.1242,
+ "step": 885
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.263594388961792,
+ "learning_rate": 4.112679969946977e-05,
+ "loss": 2.02,
+ "step": 890
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4173017740249634,
+ "learning_rate": 4.103327323807162e-05,
+ "loss": 2.0438,
+ "step": 895
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.876170039176941,
+ "learning_rate": 4.093936404083585e-05,
+ "loss": 1.9806,
+ "step": 900
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4649231433868408,
+ "learning_rate": 4.0845074349501544e-05,
+ "loss": 2.1476,
+ "step": 905
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0446043014526367,
+ "learning_rate": 4.0750406414890695e-05,
+ "loss": 1.9672,
+ "step": 910
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0225305557250977,
+ "learning_rate": 4.065536249685448e-05,
+ "loss": 1.9984,
+ "step": 915
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0120617151260376,
+ "learning_rate": 4.055994486421929e-05,
+ "loss": 2.1162,
+ "step": 920
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0469881296157837,
+ "learning_rate": 4.04641557947326e-05,
+ "loss": 2.0435,
+ "step": 925
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.2435941696166992,
+ "learning_rate": 4.036799757500856e-05,
+ "loss": 2.0431,
+ "step": 930
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0055103302001953,
+ "learning_rate": 4.027147250047348e-05,
+ "loss": 2.2021,
+ "step": 935
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1212949752807617,
+ "learning_rate": 4.017458287531094e-05,
+ "loss": 1.997,
+ "step": 940
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1048357486724854,
+ "learning_rate": 4.007733101240685e-05,
+ "loss": 1.946,
+ "step": 945
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4721689224243164,
+ "learning_rate": 3.997971923329426e-05,
+ "loss": 2.0723,
+ "step": 950
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.3793156147003174,
+ "learning_rate": 3.988174986809783e-05,
+ "loss": 2.034,
+ "step": 955
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9013482928276062,
+ "learning_rate": 3.9783425255478355e-05,
+ "loss": 1.9736,
+ "step": 960
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9192422032356262,
+ "learning_rate": 3.968474774257682e-05,
+ "loss": 1.9878,
+ "step": 965
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.9304206371307373,
+ "learning_rate": 3.9585719684958446e-05,
+ "loss": 2.117,
+ "step": 970
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.0435137748718262,
+ "learning_rate": 3.948634344655639e-05,
+ "loss": 2.0585,
+ "step": 975
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4636590480804443,
+ "learning_rate": 3.938662139961538e-05,
+ "loss": 2.0409,
+ "step": 980
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.8014529943466187,
+ "learning_rate": 3.928655592463508e-05,
+ "loss": 2.0369,
+ "step": 985
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2412620782852173,
+ "learning_rate": 3.918614941031319e-05,
+ "loss": 1.967,
+ "step": 990
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.3581103086471558,
+ "learning_rate": 3.908540425348852e-05,
+ "loss": 2.0037,
+ "step": 995
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2377780675888062,
+ "learning_rate": 3.8984322859083725e-05,
+ "loss": 1.9991,
+ "step": 1000
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 0.9209259748458862,
+ "learning_rate": 3.8882907640047896e-05,
+ "loss": 2.0448,
+ "step": 1005
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.0150959491729736,
+ "learning_rate": 3.878116101729897e-05,
+ "loss": 2.0791,
+ "step": 1010
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.5959141254425049,
+ "learning_rate": 3.867908541966594e-05,
+ "loss": 1.9997,
+ "step": 1015
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.3945012092590332,
+ "learning_rate": 3.857668328383088e-05,
+ "loss": 2.0481,
+ "step": 1020
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.2361671924591064,
+ "learning_rate": 3.847395705427075e-05,
+ "loss": 2.2664,
+ "step": 1025
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.9661719799041748,
+ "learning_rate": 3.837090918319909e-05,
+ "loss": 1.9752,
+ "step": 1030
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.6995949745178223,
+ "learning_rate": 3.8267542130507436e-05,
+ "loss": 2.1332,
+ "step": 1035
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.1248412132263184,
+ "learning_rate": 3.816385836370663e-05,
+ "loss": 2.0432,
+ "step": 1040
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.8734235763549805,
+ "learning_rate": 3.805986035786789e-05,
+ "loss": 1.9618,
+ "step": 1045
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.322766661643982,
+ "learning_rate": 3.795555059556378e-05,
+ "loss": 2.0267,
+ "step": 1050
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.0396028757095337,
+ "learning_rate": 3.7850931566808866e-05,
+ "loss": 2.1075,
+ "step": 1055
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 0.9574625492095947,
+ "learning_rate": 3.7746005769000363e-05,
+ "loss": 2.156,
+ "step": 1060
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.4480133056640625,
+ "learning_rate": 3.764077570685844e-05,
+ "loss": 1.9615,
+ "step": 1065
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.5908560752868652,
+ "learning_rate": 3.753524389236648e-05,
+ "loss": 2.0928,
+ "step": 1070
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.2628813982009888,
+ "learning_rate": 3.742941284471111e-05,
+ "loss": 2.1074,
+ "step": 1075
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2687503099441528,
+ "learning_rate": 3.7323285090222054e-05,
+ "loss": 1.9666,
+ "step": 1080
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2571731805801392,
+ "learning_rate": 3.721686316231181e-05,
+ "loss": 2.0468,
+ "step": 1085
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.007453441619873,
+ "learning_rate": 3.7110149601415215e-05,
+ "loss": 2.0624,
+ "step": 1090
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2390377521514893,
+ "learning_rate": 3.700314695492876e-05,
+ "loss": 1.9888,
+ "step": 1095
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.0878371000289917,
+ "learning_rate": 3.6895857777149825e-05,
+ "loss": 2.1013,
+ "step": 1100
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 1.478467994517504e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1100/training_args.bin b/checkpoint-1100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-1100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-1200/README.md b/checkpoint-1200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-1200/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-1200/adapter_config.json b/checkpoint-1200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-1200/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1200/adapter_model.safetensors b/checkpoint-1200/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d3871fd26d9cc993a3a9b582859988bc33199474
--- /dev/null
+++ b/checkpoint-1200/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7ac7f86b3d698764177b292c0945fc14ade25e4053b2ea32433e2ec468c1c68
+size 2099272
diff --git a/checkpoint-1200/optimizer.pt b/checkpoint-1200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..92ee7636340dd2b5d37d195ef4b533c20a5e0169
--- /dev/null
+++ b/checkpoint-1200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f46418bdb2847edff424887e74f54e939ccb878883a90f7033fb72d289847b08
+size 4208302
diff --git a/checkpoint-1200/rng_state.pth b/checkpoint-1200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b221020e695dc394acd40bb79272217a2f504bec
--- /dev/null
+++ b/checkpoint-1200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76802f226aa39edc0b86081075bc5ce21c5a32a4f1656a577b0f88858dbbf174
+size 14244
diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cf3f988a1e02202f3ce9c66ae845472cd5d86cfc
--- /dev/null
+++ b/checkpoint-1200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca63f72cf59858dda6b2859e21cee9d57c26194ed3023a7e6e3eb27a883baab6
+size 1064
diff --git a/checkpoint-1200/special_tokens_map.json b/checkpoint-1200/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-1200/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-1200/tokenizer.model b/checkpoint-1200/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-1200/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-1200/tokenizer_config.json b/checkpoint-1200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-1200/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9b1f058c7a57a56ca7312a3cf5c021abfc3c9cde
--- /dev/null
+++ b/checkpoint-1200/trainer_state.json
@@ -0,0 +1,1701 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.3731488319664166,
+ "eval_steps": 500,
+ "global_step": 1200,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.8182051181793213,
+ "learning_rate": 4.701736843304025e-05,
+ "loss": 2.0951,
+ "step": 505
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.507320761680603,
+ "learning_rate": 4.695924727871805e-05,
+ "loss": 2.0253,
+ "step": 510
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.759121835231781,
+ "learning_rate": 4.690060192757242e-05,
+ "loss": 2.0602,
+ "step": 515
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.5943195819854736,
+ "learning_rate": 4.684143377954691e-05,
+ "loss": 2.0386,
+ "step": 520
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.8568710088729858,
+ "learning_rate": 4.6781744247064955e-05,
+ "loss": 2.073,
+ "step": 525
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.3352620601654053,
+ "learning_rate": 4.6721534754996125e-05,
+ "loss": 2.1443,
+ "step": 530
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.3417474031448364,
+ "learning_rate": 4.666080674062213e-05,
+ "loss": 2.0288,
+ "step": 535
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.5334464311599731,
+ "learning_rate": 4.659956165360251e-05,
+ "loss": 2.0609,
+ "step": 540
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.9658721089363098,
+ "learning_rate": 4.6537800955940005e-05,
+ "loss": 1.9539,
+ "step": 545
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.9197947978973389,
+ "learning_rate": 4.647552612194572e-05,
+ "loss": 2.149,
+ "step": 550
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.8512137532234192,
+ "learning_rate": 4.641273863820383e-05,
+ "loss": 1.9722,
+ "step": 555
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.827289342880249,
+ "learning_rate": 4.634944000353622e-05,
+ "loss": 2.0729,
+ "step": 560
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.088416337966919,
+ "learning_rate": 4.628563172896655e-05,
+ "loss": 1.9507,
+ "step": 565
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3566908836364746,
+ "learning_rate": 4.6221315337684353e-05,
+ "loss": 2.1643,
+ "step": 570
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3541293144226074,
+ "learning_rate": 4.615649236500854e-05,
+ "loss": 2.1839,
+ "step": 575
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 0.991269588470459,
+ "learning_rate": 4.609116435835083e-05,
+ "loss": 2.0976,
+ "step": 580
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.0280535221099854,
+ "learning_rate": 4.602533287717877e-05,
+ "loss": 2.1474,
+ "step": 585
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.013123631477356,
+ "learning_rate": 4.5958999492978524e-05,
+ "loss": 2.1873,
+ "step": 590
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1753040552139282,
+ "learning_rate": 4.589216578921737e-05,
+ "loss": 2.1744,
+ "step": 595
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1839090585708618,
+ "learning_rate": 4.582483336130586e-05,
+ "loss": 1.9982,
+ "step": 600
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.0724798440933228,
+ "learning_rate": 4.575700381655979e-05,
+ "loss": 2.1234,
+ "step": 605
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 2.009913682937622,
+ "learning_rate": 4.5688678774161796e-05,
+ "loss": 1.9478,
+ "step": 610
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.9897060394287109,
+ "learning_rate": 4.561985986512271e-05,
+ "loss": 1.8268,
+ "step": 615
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.8881808519363403,
+ "learning_rate": 4.555054873224263e-05,
+ "loss": 1.9887,
+ "step": 620
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.155900001525879,
+ "learning_rate": 4.54807470300717e-05,
+ "loss": 2.0777,
+ "step": 625
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.8782421350479126,
+ "learning_rate": 4.5410456424870596e-05,
+ "loss": 2.0566,
+ "step": 630
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.3324674367904663,
+ "learning_rate": 4.5339678594570795e-05,
+ "loss": 2.047,
+ "step": 635
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.9805939197540283,
+ "learning_rate": 4.526841522873449e-05,
+ "loss": 1.962,
+ "step": 640
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4999943971633911,
+ "learning_rate": 4.519666802851422e-05,
+ "loss": 2.0972,
+ "step": 645
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4504961967468262,
+ "learning_rate": 4.5124438706612376e-05,
+ "loss": 2.0041,
+ "step": 650
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.9078169465065002,
+ "learning_rate": 4.505172898724018e-05,
+ "loss": 2.1229,
+ "step": 655
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.1635804176330566,
+ "learning_rate": 4.497854060607662e-05,
+ "loss": 2.0195,
+ "step": 660
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.46576726436615,
+ "learning_rate": 4.490487531022699e-05,
+ "loss": 2.0745,
+ "step": 665
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.2094652652740479,
+ "learning_rate": 4.4830734858181145e-05,
+ "loss": 2.1068,
+ "step": 670
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.4738895893096924,
+ "learning_rate": 4.47561210197716e-05,
+ "loss": 1.8088,
+ "step": 675
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.23384690284729,
+ "learning_rate": 4.4681035576131215e-05,
+ "loss": 2.0995,
+ "step": 680
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.8332946300506592,
+ "learning_rate": 4.46054803196507e-05,
+ "loss": 2.0541,
+ "step": 685
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.9207485318183899,
+ "learning_rate": 4.452945705393586e-05,
+ "loss": 2.166,
+ "step": 690
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.292945146560669,
+ "learning_rate": 4.445296759376449e-05,
+ "loss": 2.0784,
+ "step": 695
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9874763488769531,
+ "learning_rate": 4.437601376504307e-05,
+ "loss": 2.2087,
+ "step": 700
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9427415132522583,
+ "learning_rate": 4.4298597404763186e-05,
+ "loss": 2.1199,
+ "step": 705
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.7369529008865356,
+ "learning_rate": 4.422072036095768e-05,
+ "loss": 2.0355,
+ "step": 710
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2423696517944336,
+ "learning_rate": 4.414238449265654e-05,
+ "loss": 2.0011,
+ "step": 715
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2304831743240356,
+ "learning_rate": 4.406359166984249e-05,
+ "loss": 2.0368,
+ "step": 720
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 0.9090413451194763,
+ "learning_rate": 4.39843437734064e-05,
+ "loss": 1.9983,
+ "step": 725
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.2729507684707642,
+ "learning_rate": 4.390464269510233e-05,
+ "loss": 2.021,
+ "step": 730
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3009227514266968,
+ "learning_rate": 4.382449033750244e-05,
+ "loss": 1.9743,
+ "step": 735
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.5456056594848633,
+ "learning_rate": 4.37438886139515e-05,
+ "loss": 2.0689,
+ "step": 740
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3235007524490356,
+ "learning_rate": 4.3662839448521264e-05,
+ "loss": 2.0838,
+ "step": 745
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 2.2074007987976074,
+ "learning_rate": 4.358134477596454e-05,
+ "loss": 2.0835,
+ "step": 750
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.403738021850586,
+ "learning_rate": 4.3499406541668966e-05,
+ "loss": 2.0916,
+ "step": 755
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0940325260162354,
+ "learning_rate": 4.3417026701610616e-05,
+ "loss": 1.972,
+ "step": 760
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.666353702545166,
+ "learning_rate": 4.3334207222307275e-05,
+ "loss": 1.927,
+ "step": 765
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0777515172958374,
+ "learning_rate": 4.325095008077154e-05,
+ "loss": 2.1192,
+ "step": 770
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.7218186855316162,
+ "learning_rate": 4.316725726446353e-05,
+ "loss": 2.0774,
+ "step": 775
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.356753945350647,
+ "learning_rate": 4.3083130771243586e-05,
+ "loss": 2.0847,
+ "step": 780
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.9967429637908936,
+ "learning_rate": 4.299857260932445e-05,
+ "loss": 2.0485,
+ "step": 785
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.6216442584991455,
+ "learning_rate": 4.2913584797223397e-05,
+ "loss": 2.1008,
+ "step": 790
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.2556742429733276,
+ "learning_rate": 4.2828169363714016e-05,
+ "loss": 1.9209,
+ "step": 795
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1800439357757568,
+ "learning_rate": 4.274232834777782e-05,
+ "loss": 1.9722,
+ "step": 800
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1313499212265015,
+ "learning_rate": 4.2656063798555515e-05,
+ "loss": 1.9176,
+ "step": 805
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.137534737586975,
+ "learning_rate": 4.256937777529815e-05,
+ "loss": 1.9929,
+ "step": 810
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.0575093030929565,
+ "learning_rate": 4.2482272347317906e-05,
+ "loss": 2.166,
+ "step": 815
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.5939594507217407,
+ "learning_rate": 4.2394749593938733e-05,
+ "loss": 2.1334,
+ "step": 820
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1045507192611694,
+ "learning_rate": 4.230681160444669e-05,
+ "loss": 2.0853,
+ "step": 825
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.3480136394500732,
+ "learning_rate": 4.221846047804009e-05,
+ "loss": 2.1802,
+ "step": 830
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1822657585144043,
+ "learning_rate": 4.2129698323779366e-05,
+ "loss": 2.0739,
+ "step": 835
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1771117448806763,
+ "learning_rate": 4.204052726053676e-05,
+ "loss": 2.0238,
+ "step": 840
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.4757814407348633,
+ "learning_rate": 4.195094941694571e-05,
+ "loss": 2.1557,
+ "step": 845
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 0.9095075726509094,
+ "learning_rate": 4.1860966931350054e-05,
+ "loss": 2.1666,
+ "step": 850
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1039543151855469,
+ "learning_rate": 4.1770581951752976e-05,
+ "loss": 2.105,
+ "step": 855
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 0.8517205119132996,
+ "learning_rate": 4.1679796635765735e-05,
+ "loss": 1.9656,
+ "step": 860
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.239492654800415,
+ "learning_rate": 4.158861315055617e-05,
+ "loss": 2.0166,
+ "step": 865
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1358321905136108,
+ "learning_rate": 4.1497033672796924e-05,
+ "loss": 2.0076,
+ "step": 870
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.6215249300003052,
+ "learning_rate": 4.140506038861356e-05,
+ "loss": 2.1594,
+ "step": 875
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.0528080463409424,
+ "learning_rate": 4.131269549353229e-05,
+ "loss": 2.1416,
+ "step": 880
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 0.8976901769638062,
+ "learning_rate": 4.1219941192427644e-05,
+ "loss": 2.1242,
+ "step": 885
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.263594388961792,
+ "learning_rate": 4.112679969946977e-05,
+ "loss": 2.02,
+ "step": 890
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4173017740249634,
+ "learning_rate": 4.103327323807162e-05,
+ "loss": 2.0438,
+ "step": 895
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.876170039176941,
+ "learning_rate": 4.093936404083585e-05,
+ "loss": 1.9806,
+ "step": 900
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4649231433868408,
+ "learning_rate": 4.0845074349501544e-05,
+ "loss": 2.1476,
+ "step": 905
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0446043014526367,
+ "learning_rate": 4.0750406414890695e-05,
+ "loss": 1.9672,
+ "step": 910
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0225305557250977,
+ "learning_rate": 4.065536249685448e-05,
+ "loss": 1.9984,
+ "step": 915
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0120617151260376,
+ "learning_rate": 4.055994486421929e-05,
+ "loss": 2.1162,
+ "step": 920
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0469881296157837,
+ "learning_rate": 4.04641557947326e-05,
+ "loss": 2.0435,
+ "step": 925
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.2435941696166992,
+ "learning_rate": 4.036799757500856e-05,
+ "loss": 2.0431,
+ "step": 930
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0055103302001953,
+ "learning_rate": 4.027147250047348e-05,
+ "loss": 2.2021,
+ "step": 935
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1212949752807617,
+ "learning_rate": 4.017458287531094e-05,
+ "loss": 1.997,
+ "step": 940
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1048357486724854,
+ "learning_rate": 4.007733101240685e-05,
+ "loss": 1.946,
+ "step": 945
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4721689224243164,
+ "learning_rate": 3.997971923329426e-05,
+ "loss": 2.0723,
+ "step": 950
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.3793156147003174,
+ "learning_rate": 3.988174986809783e-05,
+ "loss": 2.034,
+ "step": 955
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9013482928276062,
+ "learning_rate": 3.9783425255478355e-05,
+ "loss": 1.9736,
+ "step": 960
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9192422032356262,
+ "learning_rate": 3.968474774257682e-05,
+ "loss": 1.9878,
+ "step": 965
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.9304206371307373,
+ "learning_rate": 3.9585719684958446e-05,
+ "loss": 2.117,
+ "step": 970
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.0435137748718262,
+ "learning_rate": 3.948634344655639e-05,
+ "loss": 2.0585,
+ "step": 975
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4636590480804443,
+ "learning_rate": 3.938662139961538e-05,
+ "loss": 2.0409,
+ "step": 980
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.8014529943466187,
+ "learning_rate": 3.928655592463508e-05,
+ "loss": 2.0369,
+ "step": 985
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2412620782852173,
+ "learning_rate": 3.918614941031319e-05,
+ "loss": 1.967,
+ "step": 990
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.3581103086471558,
+ "learning_rate": 3.908540425348852e-05,
+ "loss": 2.0037,
+ "step": 995
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2377780675888062,
+ "learning_rate": 3.8984322859083725e-05,
+ "loss": 1.9991,
+ "step": 1000
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 0.9209259748458862,
+ "learning_rate": 3.8882907640047896e-05,
+ "loss": 2.0448,
+ "step": 1005
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.0150959491729736,
+ "learning_rate": 3.878116101729897e-05,
+ "loss": 2.0791,
+ "step": 1010
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.5959141254425049,
+ "learning_rate": 3.867908541966594e-05,
+ "loss": 1.9997,
+ "step": 1015
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.3945012092590332,
+ "learning_rate": 3.857668328383088e-05,
+ "loss": 2.0481,
+ "step": 1020
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.2361671924591064,
+ "learning_rate": 3.847395705427075e-05,
+ "loss": 2.2664,
+ "step": 1025
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.9661719799041748,
+ "learning_rate": 3.837090918319909e-05,
+ "loss": 1.9752,
+ "step": 1030
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.6995949745178223,
+ "learning_rate": 3.8267542130507436e-05,
+ "loss": 2.1332,
+ "step": 1035
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.1248412132263184,
+ "learning_rate": 3.816385836370663e-05,
+ "loss": 2.0432,
+ "step": 1040
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.8734235763549805,
+ "learning_rate": 3.805986035786789e-05,
+ "loss": 1.9618,
+ "step": 1045
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.322766661643982,
+ "learning_rate": 3.795555059556378e-05,
+ "loss": 2.0267,
+ "step": 1050
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.0396028757095337,
+ "learning_rate": 3.7850931566808866e-05,
+ "loss": 2.1075,
+ "step": 1055
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 0.9574625492095947,
+ "learning_rate": 3.7746005769000363e-05,
+ "loss": 2.156,
+ "step": 1060
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.4480133056640625,
+ "learning_rate": 3.764077570685844e-05,
+ "loss": 1.9615,
+ "step": 1065
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.5908560752868652,
+ "learning_rate": 3.753524389236648e-05,
+ "loss": 2.0928,
+ "step": 1070
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.2628813982009888,
+ "learning_rate": 3.742941284471111e-05,
+ "loss": 2.1074,
+ "step": 1075
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2687503099441528,
+ "learning_rate": 3.7323285090222054e-05,
+ "loss": 1.9666,
+ "step": 1080
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2571731805801392,
+ "learning_rate": 3.721686316231181e-05,
+ "loss": 2.0468,
+ "step": 1085
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.007453441619873,
+ "learning_rate": 3.7110149601415215e-05,
+ "loss": 2.0624,
+ "step": 1090
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2390377521514893,
+ "learning_rate": 3.700314695492876e-05,
+ "loss": 1.9888,
+ "step": 1095
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.0878371000289917,
+ "learning_rate": 3.6895857777149825e-05,
+ "loss": 2.1013,
+ "step": 1100
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 0.8759217262268066,
+ "learning_rate": 3.6788284629215624e-05,
+ "loss": 1.875,
+ "step": 1105
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.1345970630645752,
+ "learning_rate": 3.668043007904219e-05,
+ "loss": 1.9096,
+ "step": 1110
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.253629446029663,
+ "learning_rate": 3.6572296701262966e-05,
+ "loss": 2.1859,
+ "step": 1115
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 0.9796190857887268,
+ "learning_rate": 3.646388707716738e-05,
+ "loss": 2.2092,
+ "step": 1120
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.3893767595291138,
+ "learning_rate": 3.635520379463926e-05,
+ "loss": 2.0026,
+ "step": 1125
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 0.8778309226036072,
+ "learning_rate": 3.6246249448095004e-05,
+ "loss": 2.2112,
+ "step": 1130
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.2479698657989502,
+ "learning_rate": 3.6137026638421696e-05,
+ "loss": 2.0221,
+ "step": 1135
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.3813824653625488,
+ "learning_rate": 3.6027537972914974e-05,
+ "loss": 1.9106,
+ "step": 1140
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.2043218612670898,
+ "learning_rate": 3.5917786065216826e-05,
+ "loss": 2.0673,
+ "step": 1145
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.5337340831756592,
+ "learning_rate": 3.580777353525318e-05,
+ "loss": 2.1463,
+ "step": 1150
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.155813455581665,
+ "learning_rate": 3.5697503009171385e-05,
+ "loss": 2.0255,
+ "step": 1155
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.034644365310669,
+ "learning_rate": 3.558697711927748e-05,
+ "loss": 2.1348,
+ "step": 1160
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.0959795713424683,
+ "learning_rate": 3.54761985039734e-05,
+ "loss": 2.1457,
+ "step": 1165
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.1938838958740234,
+ "learning_rate": 3.5365169807693966e-05,
+ "loss": 2.1256,
+ "step": 1170
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.8162047863006592,
+ "learning_rate": 3.525389368084379e-05,
+ "loss": 1.9587,
+ "step": 1175
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9358930587768555,
+ "learning_rate": 3.514237277973393e-05,
+ "loss": 1.8965,
+ "step": 1180
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9210988879203796,
+ "learning_rate": 3.503060976651862e-05,
+ "loss": 1.9669,
+ "step": 1185
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.4641343355178833,
+ "learning_rate": 3.491860730913156e-05,
+ "loss": 2.003,
+ "step": 1190
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.2458257675170898,
+ "learning_rate": 3.480636808122235e-05,
+ "loss": 2.1487,
+ "step": 1195
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.6770122051239014,
+ "learning_rate": 3.469389476209259e-05,
+ "loss": 2.0686,
+ "step": 1200
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 1.613880123457536e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1200/training_args.bin b/checkpoint-1200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-1200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-1300/README.md b/checkpoint-1300/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-1300/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-1300/adapter_config.json b/checkpoint-1300/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-1300/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1300/adapter_model.safetensors b/checkpoint-1300/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e0382739e62785a838f827ecd8282f87b0c726aa
--- /dev/null
+++ b/checkpoint-1300/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7f5a3debe200cebce199a766f211c3a049b8d6f10373d8f3f83faa87c3e960b
+size 2099272
diff --git a/checkpoint-1300/optimizer.pt b/checkpoint-1300/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3129b4c94ab5bfbc61d1d7e9ea30089521b170f2
--- /dev/null
+++ b/checkpoint-1300/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87eabd303a694623686d7f654d52392eddab6262cd4355830697686f67c0a855
+size 4208302
diff --git a/checkpoint-1300/rng_state.pth b/checkpoint-1300/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..35939a6ba3357b9f0e8c27610c512092142b7951
--- /dev/null
+++ b/checkpoint-1300/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4791e25833f91f30ef2f35dd4e766076d71c0375d8c0095c91afb487cabff9a0
+size 14244
diff --git a/checkpoint-1300/scheduler.pt b/checkpoint-1300/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..620239d9d62203c933d741ea700759304d84d8d7
--- /dev/null
+++ b/checkpoint-1300/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0dc06e39f1e7d640266c26ad77111cfa1a6963193bf9dba5df25b2337808bbe7
+size 1064
diff --git a/checkpoint-1300/special_tokens_map.json b/checkpoint-1300/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-1300/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-1300/tokenizer.model b/checkpoint-1300/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-1300/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-1300/tokenizer_config.json b/checkpoint-1300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-1300/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-1300/trainer_state.json b/checkpoint-1300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7533c209de558de20f04010af25f164baf7b197b
--- /dev/null
+++ b/checkpoint-1300/trainer_state.json
@@ -0,0 +1,1841 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.40424456796361796,
+ "eval_steps": 500,
+ "global_step": 1300,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.8182051181793213,
+ "learning_rate": 4.701736843304025e-05,
+ "loss": 2.0951,
+ "step": 505
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.507320761680603,
+ "learning_rate": 4.695924727871805e-05,
+ "loss": 2.0253,
+ "step": 510
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.759121835231781,
+ "learning_rate": 4.690060192757242e-05,
+ "loss": 2.0602,
+ "step": 515
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.5943195819854736,
+ "learning_rate": 4.684143377954691e-05,
+ "loss": 2.0386,
+ "step": 520
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.8568710088729858,
+ "learning_rate": 4.6781744247064955e-05,
+ "loss": 2.073,
+ "step": 525
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.3352620601654053,
+ "learning_rate": 4.6721534754996125e-05,
+ "loss": 2.1443,
+ "step": 530
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.3417474031448364,
+ "learning_rate": 4.666080674062213e-05,
+ "loss": 2.0288,
+ "step": 535
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.5334464311599731,
+ "learning_rate": 4.659956165360251e-05,
+ "loss": 2.0609,
+ "step": 540
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.9658721089363098,
+ "learning_rate": 4.6537800955940005e-05,
+ "loss": 1.9539,
+ "step": 545
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.9197947978973389,
+ "learning_rate": 4.647552612194572e-05,
+ "loss": 2.149,
+ "step": 550
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.8512137532234192,
+ "learning_rate": 4.641273863820383e-05,
+ "loss": 1.9722,
+ "step": 555
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.827289342880249,
+ "learning_rate": 4.634944000353622e-05,
+ "loss": 2.0729,
+ "step": 560
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.088416337966919,
+ "learning_rate": 4.628563172896655e-05,
+ "loss": 1.9507,
+ "step": 565
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3566908836364746,
+ "learning_rate": 4.6221315337684353e-05,
+ "loss": 2.1643,
+ "step": 570
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3541293144226074,
+ "learning_rate": 4.615649236500854e-05,
+ "loss": 2.1839,
+ "step": 575
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 0.991269588470459,
+ "learning_rate": 4.609116435835083e-05,
+ "loss": 2.0976,
+ "step": 580
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.0280535221099854,
+ "learning_rate": 4.602533287717877e-05,
+ "loss": 2.1474,
+ "step": 585
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.013123631477356,
+ "learning_rate": 4.5958999492978524e-05,
+ "loss": 2.1873,
+ "step": 590
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1753040552139282,
+ "learning_rate": 4.589216578921737e-05,
+ "loss": 2.1744,
+ "step": 595
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1839090585708618,
+ "learning_rate": 4.582483336130586e-05,
+ "loss": 1.9982,
+ "step": 600
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.0724798440933228,
+ "learning_rate": 4.575700381655979e-05,
+ "loss": 2.1234,
+ "step": 605
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 2.009913682937622,
+ "learning_rate": 4.5688678774161796e-05,
+ "loss": 1.9478,
+ "step": 610
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.9897060394287109,
+ "learning_rate": 4.561985986512271e-05,
+ "loss": 1.8268,
+ "step": 615
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.8881808519363403,
+ "learning_rate": 4.555054873224263e-05,
+ "loss": 1.9887,
+ "step": 620
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.155900001525879,
+ "learning_rate": 4.54807470300717e-05,
+ "loss": 2.0777,
+ "step": 625
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.8782421350479126,
+ "learning_rate": 4.5410456424870596e-05,
+ "loss": 2.0566,
+ "step": 630
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.3324674367904663,
+ "learning_rate": 4.5339678594570795e-05,
+ "loss": 2.047,
+ "step": 635
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.9805939197540283,
+ "learning_rate": 4.526841522873449e-05,
+ "loss": 1.962,
+ "step": 640
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4999943971633911,
+ "learning_rate": 4.519666802851422e-05,
+ "loss": 2.0972,
+ "step": 645
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4504961967468262,
+ "learning_rate": 4.5124438706612376e-05,
+ "loss": 2.0041,
+ "step": 650
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.9078169465065002,
+ "learning_rate": 4.505172898724018e-05,
+ "loss": 2.1229,
+ "step": 655
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.1635804176330566,
+ "learning_rate": 4.497854060607662e-05,
+ "loss": 2.0195,
+ "step": 660
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.46576726436615,
+ "learning_rate": 4.490487531022699e-05,
+ "loss": 2.0745,
+ "step": 665
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.2094652652740479,
+ "learning_rate": 4.4830734858181145e-05,
+ "loss": 2.1068,
+ "step": 670
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.4738895893096924,
+ "learning_rate": 4.47561210197716e-05,
+ "loss": 1.8088,
+ "step": 675
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.23384690284729,
+ "learning_rate": 4.4681035576131215e-05,
+ "loss": 2.0995,
+ "step": 680
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.8332946300506592,
+ "learning_rate": 4.46054803196507e-05,
+ "loss": 2.0541,
+ "step": 685
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.9207485318183899,
+ "learning_rate": 4.452945705393586e-05,
+ "loss": 2.166,
+ "step": 690
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.292945146560669,
+ "learning_rate": 4.445296759376449e-05,
+ "loss": 2.0784,
+ "step": 695
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9874763488769531,
+ "learning_rate": 4.437601376504307e-05,
+ "loss": 2.2087,
+ "step": 700
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9427415132522583,
+ "learning_rate": 4.4298597404763186e-05,
+ "loss": 2.1199,
+ "step": 705
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.7369529008865356,
+ "learning_rate": 4.422072036095768e-05,
+ "loss": 2.0355,
+ "step": 710
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2423696517944336,
+ "learning_rate": 4.414238449265654e-05,
+ "loss": 2.0011,
+ "step": 715
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2304831743240356,
+ "learning_rate": 4.406359166984249e-05,
+ "loss": 2.0368,
+ "step": 720
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 0.9090413451194763,
+ "learning_rate": 4.39843437734064e-05,
+ "loss": 1.9983,
+ "step": 725
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.2729507684707642,
+ "learning_rate": 4.390464269510233e-05,
+ "loss": 2.021,
+ "step": 730
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3009227514266968,
+ "learning_rate": 4.382449033750244e-05,
+ "loss": 1.9743,
+ "step": 735
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.5456056594848633,
+ "learning_rate": 4.37438886139515e-05,
+ "loss": 2.0689,
+ "step": 740
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3235007524490356,
+ "learning_rate": 4.3662839448521264e-05,
+ "loss": 2.0838,
+ "step": 745
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 2.2074007987976074,
+ "learning_rate": 4.358134477596454e-05,
+ "loss": 2.0835,
+ "step": 750
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.403738021850586,
+ "learning_rate": 4.3499406541668966e-05,
+ "loss": 2.0916,
+ "step": 755
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0940325260162354,
+ "learning_rate": 4.3417026701610616e-05,
+ "loss": 1.972,
+ "step": 760
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.666353702545166,
+ "learning_rate": 4.3334207222307275e-05,
+ "loss": 1.927,
+ "step": 765
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0777515172958374,
+ "learning_rate": 4.325095008077154e-05,
+ "loss": 2.1192,
+ "step": 770
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.7218186855316162,
+ "learning_rate": 4.316725726446353e-05,
+ "loss": 2.0774,
+ "step": 775
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.356753945350647,
+ "learning_rate": 4.3083130771243586e-05,
+ "loss": 2.0847,
+ "step": 780
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.9967429637908936,
+ "learning_rate": 4.299857260932445e-05,
+ "loss": 2.0485,
+ "step": 785
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.6216442584991455,
+ "learning_rate": 4.2913584797223397e-05,
+ "loss": 2.1008,
+ "step": 790
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.2556742429733276,
+ "learning_rate": 4.2828169363714016e-05,
+ "loss": 1.9209,
+ "step": 795
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1800439357757568,
+ "learning_rate": 4.274232834777782e-05,
+ "loss": 1.9722,
+ "step": 800
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1313499212265015,
+ "learning_rate": 4.2656063798555515e-05,
+ "loss": 1.9176,
+ "step": 805
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.137534737586975,
+ "learning_rate": 4.256937777529815e-05,
+ "loss": 1.9929,
+ "step": 810
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.0575093030929565,
+ "learning_rate": 4.2482272347317906e-05,
+ "loss": 2.166,
+ "step": 815
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.5939594507217407,
+ "learning_rate": 4.2394749593938733e-05,
+ "loss": 2.1334,
+ "step": 820
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1045507192611694,
+ "learning_rate": 4.230681160444669e-05,
+ "loss": 2.0853,
+ "step": 825
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.3480136394500732,
+ "learning_rate": 4.221846047804009e-05,
+ "loss": 2.1802,
+ "step": 830
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1822657585144043,
+ "learning_rate": 4.2129698323779366e-05,
+ "loss": 2.0739,
+ "step": 835
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1771117448806763,
+ "learning_rate": 4.204052726053676e-05,
+ "loss": 2.0238,
+ "step": 840
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.4757814407348633,
+ "learning_rate": 4.195094941694571e-05,
+ "loss": 2.1557,
+ "step": 845
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 0.9095075726509094,
+ "learning_rate": 4.1860966931350054e-05,
+ "loss": 2.1666,
+ "step": 850
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1039543151855469,
+ "learning_rate": 4.1770581951752976e-05,
+ "loss": 2.105,
+ "step": 855
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 0.8517205119132996,
+ "learning_rate": 4.1679796635765735e-05,
+ "loss": 1.9656,
+ "step": 860
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.239492654800415,
+ "learning_rate": 4.158861315055617e-05,
+ "loss": 2.0166,
+ "step": 865
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1358321905136108,
+ "learning_rate": 4.1497033672796924e-05,
+ "loss": 2.0076,
+ "step": 870
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.6215249300003052,
+ "learning_rate": 4.140506038861356e-05,
+ "loss": 2.1594,
+ "step": 875
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.0528080463409424,
+ "learning_rate": 4.131269549353229e-05,
+ "loss": 2.1416,
+ "step": 880
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 0.8976901769638062,
+ "learning_rate": 4.1219941192427644e-05,
+ "loss": 2.1242,
+ "step": 885
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.263594388961792,
+ "learning_rate": 4.112679969946977e-05,
+ "loss": 2.02,
+ "step": 890
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4173017740249634,
+ "learning_rate": 4.103327323807162e-05,
+ "loss": 2.0438,
+ "step": 895
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.876170039176941,
+ "learning_rate": 4.093936404083585e-05,
+ "loss": 1.9806,
+ "step": 900
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4649231433868408,
+ "learning_rate": 4.0845074349501544e-05,
+ "loss": 2.1476,
+ "step": 905
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0446043014526367,
+ "learning_rate": 4.0750406414890695e-05,
+ "loss": 1.9672,
+ "step": 910
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0225305557250977,
+ "learning_rate": 4.065536249685448e-05,
+ "loss": 1.9984,
+ "step": 915
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0120617151260376,
+ "learning_rate": 4.055994486421929e-05,
+ "loss": 2.1162,
+ "step": 920
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0469881296157837,
+ "learning_rate": 4.04641557947326e-05,
+ "loss": 2.0435,
+ "step": 925
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.2435941696166992,
+ "learning_rate": 4.036799757500856e-05,
+ "loss": 2.0431,
+ "step": 930
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0055103302001953,
+ "learning_rate": 4.027147250047348e-05,
+ "loss": 2.2021,
+ "step": 935
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1212949752807617,
+ "learning_rate": 4.017458287531094e-05,
+ "loss": 1.997,
+ "step": 940
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1048357486724854,
+ "learning_rate": 4.007733101240685e-05,
+ "loss": 1.946,
+ "step": 945
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4721689224243164,
+ "learning_rate": 3.997971923329426e-05,
+ "loss": 2.0723,
+ "step": 950
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.3793156147003174,
+ "learning_rate": 3.988174986809783e-05,
+ "loss": 2.034,
+ "step": 955
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9013482928276062,
+ "learning_rate": 3.9783425255478355e-05,
+ "loss": 1.9736,
+ "step": 960
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9192422032356262,
+ "learning_rate": 3.968474774257682e-05,
+ "loss": 1.9878,
+ "step": 965
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.9304206371307373,
+ "learning_rate": 3.9585719684958446e-05,
+ "loss": 2.117,
+ "step": 970
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.0435137748718262,
+ "learning_rate": 3.948634344655639e-05,
+ "loss": 2.0585,
+ "step": 975
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4636590480804443,
+ "learning_rate": 3.938662139961538e-05,
+ "loss": 2.0409,
+ "step": 980
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.8014529943466187,
+ "learning_rate": 3.928655592463508e-05,
+ "loss": 2.0369,
+ "step": 985
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2412620782852173,
+ "learning_rate": 3.918614941031319e-05,
+ "loss": 1.967,
+ "step": 990
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.3581103086471558,
+ "learning_rate": 3.908540425348852e-05,
+ "loss": 2.0037,
+ "step": 995
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2377780675888062,
+ "learning_rate": 3.8984322859083725e-05,
+ "loss": 1.9991,
+ "step": 1000
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 0.9209259748458862,
+ "learning_rate": 3.8882907640047896e-05,
+ "loss": 2.0448,
+ "step": 1005
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.0150959491729736,
+ "learning_rate": 3.878116101729897e-05,
+ "loss": 2.0791,
+ "step": 1010
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.5959141254425049,
+ "learning_rate": 3.867908541966594e-05,
+ "loss": 1.9997,
+ "step": 1015
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.3945012092590332,
+ "learning_rate": 3.857668328383088e-05,
+ "loss": 2.0481,
+ "step": 1020
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.2361671924591064,
+ "learning_rate": 3.847395705427075e-05,
+ "loss": 2.2664,
+ "step": 1025
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.9661719799041748,
+ "learning_rate": 3.837090918319909e-05,
+ "loss": 1.9752,
+ "step": 1030
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.6995949745178223,
+ "learning_rate": 3.8267542130507436e-05,
+ "loss": 2.1332,
+ "step": 1035
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.1248412132263184,
+ "learning_rate": 3.816385836370663e-05,
+ "loss": 2.0432,
+ "step": 1040
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.8734235763549805,
+ "learning_rate": 3.805986035786789e-05,
+ "loss": 1.9618,
+ "step": 1045
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.322766661643982,
+ "learning_rate": 3.795555059556378e-05,
+ "loss": 2.0267,
+ "step": 1050
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.0396028757095337,
+ "learning_rate": 3.7850931566808866e-05,
+ "loss": 2.1075,
+ "step": 1055
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 0.9574625492095947,
+ "learning_rate": 3.7746005769000363e-05,
+ "loss": 2.156,
+ "step": 1060
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.4480133056640625,
+ "learning_rate": 3.764077570685844e-05,
+ "loss": 1.9615,
+ "step": 1065
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.5908560752868652,
+ "learning_rate": 3.753524389236648e-05,
+ "loss": 2.0928,
+ "step": 1070
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.2628813982009888,
+ "learning_rate": 3.742941284471111e-05,
+ "loss": 2.1074,
+ "step": 1075
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2687503099441528,
+ "learning_rate": 3.7323285090222054e-05,
+ "loss": 1.9666,
+ "step": 1080
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2571731805801392,
+ "learning_rate": 3.721686316231181e-05,
+ "loss": 2.0468,
+ "step": 1085
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.007453441619873,
+ "learning_rate": 3.7110149601415215e-05,
+ "loss": 2.0624,
+ "step": 1090
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2390377521514893,
+ "learning_rate": 3.700314695492876e-05,
+ "loss": 1.9888,
+ "step": 1095
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.0878371000289917,
+ "learning_rate": 3.6895857777149825e-05,
+ "loss": 2.1013,
+ "step": 1100
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 0.8759217262268066,
+ "learning_rate": 3.6788284629215624e-05,
+ "loss": 1.875,
+ "step": 1105
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.1345970630645752,
+ "learning_rate": 3.668043007904219e-05,
+ "loss": 1.9096,
+ "step": 1110
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.253629446029663,
+ "learning_rate": 3.6572296701262966e-05,
+ "loss": 2.1859,
+ "step": 1115
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 0.9796190857887268,
+ "learning_rate": 3.646388707716738e-05,
+ "loss": 2.2092,
+ "step": 1120
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.3893767595291138,
+ "learning_rate": 3.635520379463926e-05,
+ "loss": 2.0026,
+ "step": 1125
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 0.8778309226036072,
+ "learning_rate": 3.6246249448095004e-05,
+ "loss": 2.2112,
+ "step": 1130
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.2479698657989502,
+ "learning_rate": 3.6137026638421696e-05,
+ "loss": 2.0221,
+ "step": 1135
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.3813824653625488,
+ "learning_rate": 3.6027537972914974e-05,
+ "loss": 1.9106,
+ "step": 1140
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.2043218612670898,
+ "learning_rate": 3.5917786065216826e-05,
+ "loss": 2.0673,
+ "step": 1145
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.5337340831756592,
+ "learning_rate": 3.580777353525318e-05,
+ "loss": 2.1463,
+ "step": 1150
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.155813455581665,
+ "learning_rate": 3.5697503009171385e-05,
+ "loss": 2.0255,
+ "step": 1155
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.034644365310669,
+ "learning_rate": 3.558697711927748e-05,
+ "loss": 2.1348,
+ "step": 1160
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.0959795713424683,
+ "learning_rate": 3.54761985039734e-05,
+ "loss": 2.1457,
+ "step": 1165
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.1938838958740234,
+ "learning_rate": 3.5365169807693966e-05,
+ "loss": 2.1256,
+ "step": 1170
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.8162047863006592,
+ "learning_rate": 3.525389368084379e-05,
+ "loss": 1.9587,
+ "step": 1175
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9358930587768555,
+ "learning_rate": 3.514237277973393e-05,
+ "loss": 1.8965,
+ "step": 1180
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9210988879203796,
+ "learning_rate": 3.503060976651862e-05,
+ "loss": 1.9669,
+ "step": 1185
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.4641343355178833,
+ "learning_rate": 3.491860730913156e-05,
+ "loss": 2.003,
+ "step": 1190
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.2458257675170898,
+ "learning_rate": 3.480636808122235e-05,
+ "loss": 2.1487,
+ "step": 1195
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.6770122051239014,
+ "learning_rate": 3.469389476209259e-05,
+ "loss": 2.0686,
+ "step": 1200
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9083845019340515,
+ "learning_rate": 3.458119003663199e-05,
+ "loss": 2.0284,
+ "step": 1205
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.2679696083068848,
+ "learning_rate": 3.446825659525421e-05,
+ "loss": 2.0555,
+ "step": 1210
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.3823720216751099,
+ "learning_rate": 3.435509713383268e-05,
+ "loss": 1.9375,
+ "step": 1215
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.5862077474594116,
+ "learning_rate": 3.424171435363623e-05,
+ "loss": 2.0271,
+ "step": 1220
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 2.0107533931732178,
+ "learning_rate": 3.412811096126461e-05,
+ "loss": 2.1897,
+ "step": 1225
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.4544458389282227,
+ "learning_rate": 3.401428966858387e-05,
+ "loss": 1.9978,
+ "step": 1230
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.188170075416565,
+ "learning_rate": 3.390025319266167e-05,
+ "loss": 2.0688,
+ "step": 1235
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.1016322374343872,
+ "learning_rate": 3.3786004255702336e-05,
+ "loss": 2.0396,
+ "step": 1240
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.6623334884643555,
+ "learning_rate": 3.3671545584981954e-05,
+ "loss": 1.9566,
+ "step": 1245
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9161584377288818,
+ "learning_rate": 3.355687991278324e-05,
+ "loss": 2.0474,
+ "step": 1250
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9911025166511536,
+ "learning_rate": 3.3442009976330305e-05,
+ "loss": 2.2163,
+ "step": 1255
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.1504255533218384,
+ "learning_rate": 3.332693851772331e-05,
+ "loss": 2.1088,
+ "step": 1260
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9544184803962708,
+ "learning_rate": 3.3211668283873035e-05,
+ "loss": 1.8947,
+ "step": 1265
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.4625756740570068,
+ "learning_rate": 3.3096202026435304e-05,
+ "loss": 2.1748,
+ "step": 1270
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.3267475366592407,
+ "learning_rate": 3.298054250174527e-05,
+ "loss": 1.9218,
+ "step": 1275
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.9869363903999329,
+ "learning_rate": 3.2864692470751654e-05,
+ "loss": 2.2723,
+ "step": 1280
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.5177838802337646,
+ "learning_rate": 3.27486546989508e-05,
+ "loss": 2.1456,
+ "step": 1285
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.1998714208602905,
+ "learning_rate": 3.263243195632068e-05,
+ "loss": 1.8877,
+ "step": 1290
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.2112164497375488,
+ "learning_rate": 3.2516027017254785e-05,
+ "loss": 2.0615,
+ "step": 1295
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.0616129636764526,
+ "learning_rate": 3.239944266049587e-05,
+ "loss": 2.0402,
+ "step": 1300
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 1.747451332067328e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1300/training_args.bin b/checkpoint-1300/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-1300/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-1400/README.md b/checkpoint-1400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-1400/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-1400/adapter_config.json b/checkpoint-1400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-1400/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1400/adapter_model.safetensors b/checkpoint-1400/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..eaa8b2650253ebfcf941d1c16402912ba0b8cbf6
--- /dev/null
+++ b/checkpoint-1400/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30abf59f7e85e60d6fc510e94a01f896e2ef14b837a8b3f56ac7d4bb9a248e9c
+size 2099272
diff --git a/checkpoint-1400/optimizer.pt b/checkpoint-1400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3a7f029ddb416fca8d95340f936edf538016445d
--- /dev/null
+++ b/checkpoint-1400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fafb6357aceb71a6ccddedd4839e0ef4f366976d7d2e2245e1422ecc10b7e69c
+size 4208302
diff --git a/checkpoint-1400/rng_state.pth b/checkpoint-1400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f33244e2e624fdb74b459fefcf7646f046d95e23
--- /dev/null
+++ b/checkpoint-1400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:77b2546a9b2821ed4272c4daa26fcfa05f024372bf736fd11b02713ac1401a37
+size 14244
diff --git a/checkpoint-1400/scheduler.pt b/checkpoint-1400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6e6511dff5368667fbf76bca367bf31a711991b2
--- /dev/null
+++ b/checkpoint-1400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e15b497f0a675bff3c8aab7f24bf8b46dabf69d9eb519a60f1c8b5f7ccc2be1c
+size 1064
diff --git a/checkpoint-1400/special_tokens_map.json b/checkpoint-1400/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-1400/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-1400/tokenizer.model b/checkpoint-1400/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-1400/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-1400/tokenizer_config.json b/checkpoint-1400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-1400/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-1400/trainer_state.json b/checkpoint-1400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..14654d90543430d5e16193ecb647812ae3bd312d
--- /dev/null
+++ b/checkpoint-1400/trainer_state.json
@@ -0,0 +1,1981 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.43534030396081935,
+ "eval_steps": 500,
+ "global_step": 1400,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.8182051181793213,
+ "learning_rate": 4.701736843304025e-05,
+ "loss": 2.0951,
+ "step": 505
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.507320761680603,
+ "learning_rate": 4.695924727871805e-05,
+ "loss": 2.0253,
+ "step": 510
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.759121835231781,
+ "learning_rate": 4.690060192757242e-05,
+ "loss": 2.0602,
+ "step": 515
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.5943195819854736,
+ "learning_rate": 4.684143377954691e-05,
+ "loss": 2.0386,
+ "step": 520
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.8568710088729858,
+ "learning_rate": 4.6781744247064955e-05,
+ "loss": 2.073,
+ "step": 525
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.3352620601654053,
+ "learning_rate": 4.6721534754996125e-05,
+ "loss": 2.1443,
+ "step": 530
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.3417474031448364,
+ "learning_rate": 4.666080674062213e-05,
+ "loss": 2.0288,
+ "step": 535
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.5334464311599731,
+ "learning_rate": 4.659956165360251e-05,
+ "loss": 2.0609,
+ "step": 540
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.9658721089363098,
+ "learning_rate": 4.6537800955940005e-05,
+ "loss": 1.9539,
+ "step": 545
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.9197947978973389,
+ "learning_rate": 4.647552612194572e-05,
+ "loss": 2.149,
+ "step": 550
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.8512137532234192,
+ "learning_rate": 4.641273863820383e-05,
+ "loss": 1.9722,
+ "step": 555
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.827289342880249,
+ "learning_rate": 4.634944000353622e-05,
+ "loss": 2.0729,
+ "step": 560
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.088416337966919,
+ "learning_rate": 4.628563172896655e-05,
+ "loss": 1.9507,
+ "step": 565
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3566908836364746,
+ "learning_rate": 4.6221315337684353e-05,
+ "loss": 2.1643,
+ "step": 570
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3541293144226074,
+ "learning_rate": 4.615649236500854e-05,
+ "loss": 2.1839,
+ "step": 575
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 0.991269588470459,
+ "learning_rate": 4.609116435835083e-05,
+ "loss": 2.0976,
+ "step": 580
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.0280535221099854,
+ "learning_rate": 4.602533287717877e-05,
+ "loss": 2.1474,
+ "step": 585
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.013123631477356,
+ "learning_rate": 4.5958999492978524e-05,
+ "loss": 2.1873,
+ "step": 590
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1753040552139282,
+ "learning_rate": 4.589216578921737e-05,
+ "loss": 2.1744,
+ "step": 595
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1839090585708618,
+ "learning_rate": 4.582483336130586e-05,
+ "loss": 1.9982,
+ "step": 600
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.0724798440933228,
+ "learning_rate": 4.575700381655979e-05,
+ "loss": 2.1234,
+ "step": 605
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 2.009913682937622,
+ "learning_rate": 4.5688678774161796e-05,
+ "loss": 1.9478,
+ "step": 610
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.9897060394287109,
+ "learning_rate": 4.561985986512271e-05,
+ "loss": 1.8268,
+ "step": 615
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.8881808519363403,
+ "learning_rate": 4.555054873224263e-05,
+ "loss": 1.9887,
+ "step": 620
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.155900001525879,
+ "learning_rate": 4.54807470300717e-05,
+ "loss": 2.0777,
+ "step": 625
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.8782421350479126,
+ "learning_rate": 4.5410456424870596e-05,
+ "loss": 2.0566,
+ "step": 630
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.3324674367904663,
+ "learning_rate": 4.5339678594570795e-05,
+ "loss": 2.047,
+ "step": 635
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.9805939197540283,
+ "learning_rate": 4.526841522873449e-05,
+ "loss": 1.962,
+ "step": 640
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4999943971633911,
+ "learning_rate": 4.519666802851422e-05,
+ "loss": 2.0972,
+ "step": 645
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4504961967468262,
+ "learning_rate": 4.5124438706612376e-05,
+ "loss": 2.0041,
+ "step": 650
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.9078169465065002,
+ "learning_rate": 4.505172898724018e-05,
+ "loss": 2.1229,
+ "step": 655
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.1635804176330566,
+ "learning_rate": 4.497854060607662e-05,
+ "loss": 2.0195,
+ "step": 660
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.46576726436615,
+ "learning_rate": 4.490487531022699e-05,
+ "loss": 2.0745,
+ "step": 665
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.2094652652740479,
+ "learning_rate": 4.4830734858181145e-05,
+ "loss": 2.1068,
+ "step": 670
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.4738895893096924,
+ "learning_rate": 4.47561210197716e-05,
+ "loss": 1.8088,
+ "step": 675
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.23384690284729,
+ "learning_rate": 4.4681035576131215e-05,
+ "loss": 2.0995,
+ "step": 680
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.8332946300506592,
+ "learning_rate": 4.46054803196507e-05,
+ "loss": 2.0541,
+ "step": 685
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.9207485318183899,
+ "learning_rate": 4.452945705393586e-05,
+ "loss": 2.166,
+ "step": 690
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.292945146560669,
+ "learning_rate": 4.445296759376449e-05,
+ "loss": 2.0784,
+ "step": 695
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9874763488769531,
+ "learning_rate": 4.437601376504307e-05,
+ "loss": 2.2087,
+ "step": 700
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9427415132522583,
+ "learning_rate": 4.4298597404763186e-05,
+ "loss": 2.1199,
+ "step": 705
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.7369529008865356,
+ "learning_rate": 4.422072036095768e-05,
+ "loss": 2.0355,
+ "step": 710
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2423696517944336,
+ "learning_rate": 4.414238449265654e-05,
+ "loss": 2.0011,
+ "step": 715
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2304831743240356,
+ "learning_rate": 4.406359166984249e-05,
+ "loss": 2.0368,
+ "step": 720
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 0.9090413451194763,
+ "learning_rate": 4.39843437734064e-05,
+ "loss": 1.9983,
+ "step": 725
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.2729507684707642,
+ "learning_rate": 4.390464269510233e-05,
+ "loss": 2.021,
+ "step": 730
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3009227514266968,
+ "learning_rate": 4.382449033750244e-05,
+ "loss": 1.9743,
+ "step": 735
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.5456056594848633,
+ "learning_rate": 4.37438886139515e-05,
+ "loss": 2.0689,
+ "step": 740
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3235007524490356,
+ "learning_rate": 4.3662839448521264e-05,
+ "loss": 2.0838,
+ "step": 745
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 2.2074007987976074,
+ "learning_rate": 4.358134477596454e-05,
+ "loss": 2.0835,
+ "step": 750
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.403738021850586,
+ "learning_rate": 4.3499406541668966e-05,
+ "loss": 2.0916,
+ "step": 755
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0940325260162354,
+ "learning_rate": 4.3417026701610616e-05,
+ "loss": 1.972,
+ "step": 760
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.666353702545166,
+ "learning_rate": 4.3334207222307275e-05,
+ "loss": 1.927,
+ "step": 765
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0777515172958374,
+ "learning_rate": 4.325095008077154e-05,
+ "loss": 2.1192,
+ "step": 770
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.7218186855316162,
+ "learning_rate": 4.316725726446353e-05,
+ "loss": 2.0774,
+ "step": 775
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.356753945350647,
+ "learning_rate": 4.3083130771243586e-05,
+ "loss": 2.0847,
+ "step": 780
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.9967429637908936,
+ "learning_rate": 4.299857260932445e-05,
+ "loss": 2.0485,
+ "step": 785
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.6216442584991455,
+ "learning_rate": 4.2913584797223397e-05,
+ "loss": 2.1008,
+ "step": 790
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.2556742429733276,
+ "learning_rate": 4.2828169363714016e-05,
+ "loss": 1.9209,
+ "step": 795
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1800439357757568,
+ "learning_rate": 4.274232834777782e-05,
+ "loss": 1.9722,
+ "step": 800
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1313499212265015,
+ "learning_rate": 4.2656063798555515e-05,
+ "loss": 1.9176,
+ "step": 805
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.137534737586975,
+ "learning_rate": 4.256937777529815e-05,
+ "loss": 1.9929,
+ "step": 810
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.0575093030929565,
+ "learning_rate": 4.2482272347317906e-05,
+ "loss": 2.166,
+ "step": 815
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.5939594507217407,
+ "learning_rate": 4.2394749593938733e-05,
+ "loss": 2.1334,
+ "step": 820
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1045507192611694,
+ "learning_rate": 4.230681160444669e-05,
+ "loss": 2.0853,
+ "step": 825
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.3480136394500732,
+ "learning_rate": 4.221846047804009e-05,
+ "loss": 2.1802,
+ "step": 830
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1822657585144043,
+ "learning_rate": 4.2129698323779366e-05,
+ "loss": 2.0739,
+ "step": 835
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1771117448806763,
+ "learning_rate": 4.204052726053676e-05,
+ "loss": 2.0238,
+ "step": 840
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.4757814407348633,
+ "learning_rate": 4.195094941694571e-05,
+ "loss": 2.1557,
+ "step": 845
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 0.9095075726509094,
+ "learning_rate": 4.1860966931350054e-05,
+ "loss": 2.1666,
+ "step": 850
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1039543151855469,
+ "learning_rate": 4.1770581951752976e-05,
+ "loss": 2.105,
+ "step": 855
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 0.8517205119132996,
+ "learning_rate": 4.1679796635765735e-05,
+ "loss": 1.9656,
+ "step": 860
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.239492654800415,
+ "learning_rate": 4.158861315055617e-05,
+ "loss": 2.0166,
+ "step": 865
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1358321905136108,
+ "learning_rate": 4.1497033672796924e-05,
+ "loss": 2.0076,
+ "step": 870
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.6215249300003052,
+ "learning_rate": 4.140506038861356e-05,
+ "loss": 2.1594,
+ "step": 875
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.0528080463409424,
+ "learning_rate": 4.131269549353229e-05,
+ "loss": 2.1416,
+ "step": 880
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 0.8976901769638062,
+ "learning_rate": 4.1219941192427644e-05,
+ "loss": 2.1242,
+ "step": 885
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.263594388961792,
+ "learning_rate": 4.112679969946977e-05,
+ "loss": 2.02,
+ "step": 890
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4173017740249634,
+ "learning_rate": 4.103327323807162e-05,
+ "loss": 2.0438,
+ "step": 895
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.876170039176941,
+ "learning_rate": 4.093936404083585e-05,
+ "loss": 1.9806,
+ "step": 900
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4649231433868408,
+ "learning_rate": 4.0845074349501544e-05,
+ "loss": 2.1476,
+ "step": 905
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0446043014526367,
+ "learning_rate": 4.0750406414890695e-05,
+ "loss": 1.9672,
+ "step": 910
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0225305557250977,
+ "learning_rate": 4.065536249685448e-05,
+ "loss": 1.9984,
+ "step": 915
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0120617151260376,
+ "learning_rate": 4.055994486421929e-05,
+ "loss": 2.1162,
+ "step": 920
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0469881296157837,
+ "learning_rate": 4.04641557947326e-05,
+ "loss": 2.0435,
+ "step": 925
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.2435941696166992,
+ "learning_rate": 4.036799757500856e-05,
+ "loss": 2.0431,
+ "step": 930
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0055103302001953,
+ "learning_rate": 4.027147250047348e-05,
+ "loss": 2.2021,
+ "step": 935
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1212949752807617,
+ "learning_rate": 4.017458287531094e-05,
+ "loss": 1.997,
+ "step": 940
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1048357486724854,
+ "learning_rate": 4.007733101240685e-05,
+ "loss": 1.946,
+ "step": 945
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4721689224243164,
+ "learning_rate": 3.997971923329426e-05,
+ "loss": 2.0723,
+ "step": 950
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.3793156147003174,
+ "learning_rate": 3.988174986809783e-05,
+ "loss": 2.034,
+ "step": 955
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9013482928276062,
+ "learning_rate": 3.9783425255478355e-05,
+ "loss": 1.9736,
+ "step": 960
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9192422032356262,
+ "learning_rate": 3.968474774257682e-05,
+ "loss": 1.9878,
+ "step": 965
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.9304206371307373,
+ "learning_rate": 3.9585719684958446e-05,
+ "loss": 2.117,
+ "step": 970
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.0435137748718262,
+ "learning_rate": 3.948634344655639e-05,
+ "loss": 2.0585,
+ "step": 975
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4636590480804443,
+ "learning_rate": 3.938662139961538e-05,
+ "loss": 2.0409,
+ "step": 980
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.8014529943466187,
+ "learning_rate": 3.928655592463508e-05,
+ "loss": 2.0369,
+ "step": 985
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2412620782852173,
+ "learning_rate": 3.918614941031319e-05,
+ "loss": 1.967,
+ "step": 990
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.3581103086471558,
+ "learning_rate": 3.908540425348852e-05,
+ "loss": 2.0037,
+ "step": 995
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2377780675888062,
+ "learning_rate": 3.8984322859083725e-05,
+ "loss": 1.9991,
+ "step": 1000
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 0.9209259748458862,
+ "learning_rate": 3.8882907640047896e-05,
+ "loss": 2.0448,
+ "step": 1005
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.0150959491729736,
+ "learning_rate": 3.878116101729897e-05,
+ "loss": 2.0791,
+ "step": 1010
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.5959141254425049,
+ "learning_rate": 3.867908541966594e-05,
+ "loss": 1.9997,
+ "step": 1015
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.3945012092590332,
+ "learning_rate": 3.857668328383088e-05,
+ "loss": 2.0481,
+ "step": 1020
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.2361671924591064,
+ "learning_rate": 3.847395705427075e-05,
+ "loss": 2.2664,
+ "step": 1025
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.9661719799041748,
+ "learning_rate": 3.837090918319909e-05,
+ "loss": 1.9752,
+ "step": 1030
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.6995949745178223,
+ "learning_rate": 3.8267542130507436e-05,
+ "loss": 2.1332,
+ "step": 1035
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.1248412132263184,
+ "learning_rate": 3.816385836370663e-05,
+ "loss": 2.0432,
+ "step": 1040
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.8734235763549805,
+ "learning_rate": 3.805986035786789e-05,
+ "loss": 1.9618,
+ "step": 1045
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.322766661643982,
+ "learning_rate": 3.795555059556378e-05,
+ "loss": 2.0267,
+ "step": 1050
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.0396028757095337,
+ "learning_rate": 3.7850931566808866e-05,
+ "loss": 2.1075,
+ "step": 1055
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 0.9574625492095947,
+ "learning_rate": 3.7746005769000363e-05,
+ "loss": 2.156,
+ "step": 1060
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.4480133056640625,
+ "learning_rate": 3.764077570685844e-05,
+ "loss": 1.9615,
+ "step": 1065
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.5908560752868652,
+ "learning_rate": 3.753524389236648e-05,
+ "loss": 2.0928,
+ "step": 1070
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.2628813982009888,
+ "learning_rate": 3.742941284471111e-05,
+ "loss": 2.1074,
+ "step": 1075
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2687503099441528,
+ "learning_rate": 3.7323285090222054e-05,
+ "loss": 1.9666,
+ "step": 1080
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2571731805801392,
+ "learning_rate": 3.721686316231181e-05,
+ "loss": 2.0468,
+ "step": 1085
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.007453441619873,
+ "learning_rate": 3.7110149601415215e-05,
+ "loss": 2.0624,
+ "step": 1090
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2390377521514893,
+ "learning_rate": 3.700314695492876e-05,
+ "loss": 1.9888,
+ "step": 1095
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.0878371000289917,
+ "learning_rate": 3.6895857777149825e-05,
+ "loss": 2.1013,
+ "step": 1100
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 0.8759217262268066,
+ "learning_rate": 3.6788284629215624e-05,
+ "loss": 1.875,
+ "step": 1105
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.1345970630645752,
+ "learning_rate": 3.668043007904219e-05,
+ "loss": 1.9096,
+ "step": 1110
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.253629446029663,
+ "learning_rate": 3.6572296701262966e-05,
+ "loss": 2.1859,
+ "step": 1115
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 0.9796190857887268,
+ "learning_rate": 3.646388707716738e-05,
+ "loss": 2.2092,
+ "step": 1120
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.3893767595291138,
+ "learning_rate": 3.635520379463926e-05,
+ "loss": 2.0026,
+ "step": 1125
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 0.8778309226036072,
+ "learning_rate": 3.6246249448095004e-05,
+ "loss": 2.2112,
+ "step": 1130
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.2479698657989502,
+ "learning_rate": 3.6137026638421696e-05,
+ "loss": 2.0221,
+ "step": 1135
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.3813824653625488,
+ "learning_rate": 3.6027537972914974e-05,
+ "loss": 1.9106,
+ "step": 1140
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.2043218612670898,
+ "learning_rate": 3.5917786065216826e-05,
+ "loss": 2.0673,
+ "step": 1145
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.5337340831756592,
+ "learning_rate": 3.580777353525318e-05,
+ "loss": 2.1463,
+ "step": 1150
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.155813455581665,
+ "learning_rate": 3.5697503009171385e-05,
+ "loss": 2.0255,
+ "step": 1155
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.034644365310669,
+ "learning_rate": 3.558697711927748e-05,
+ "loss": 2.1348,
+ "step": 1160
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.0959795713424683,
+ "learning_rate": 3.54761985039734e-05,
+ "loss": 2.1457,
+ "step": 1165
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.1938838958740234,
+ "learning_rate": 3.5365169807693966e-05,
+ "loss": 2.1256,
+ "step": 1170
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.8162047863006592,
+ "learning_rate": 3.525389368084379e-05,
+ "loss": 1.9587,
+ "step": 1175
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9358930587768555,
+ "learning_rate": 3.514237277973393e-05,
+ "loss": 1.8965,
+ "step": 1180
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9210988879203796,
+ "learning_rate": 3.503060976651862e-05,
+ "loss": 1.9669,
+ "step": 1185
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.4641343355178833,
+ "learning_rate": 3.491860730913156e-05,
+ "loss": 2.003,
+ "step": 1190
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.2458257675170898,
+ "learning_rate": 3.480636808122235e-05,
+ "loss": 2.1487,
+ "step": 1195
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.6770122051239014,
+ "learning_rate": 3.469389476209259e-05,
+ "loss": 2.0686,
+ "step": 1200
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9083845019340515,
+ "learning_rate": 3.458119003663199e-05,
+ "loss": 2.0284,
+ "step": 1205
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.2679696083068848,
+ "learning_rate": 3.446825659525421e-05,
+ "loss": 2.0555,
+ "step": 1210
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.3823720216751099,
+ "learning_rate": 3.435509713383268e-05,
+ "loss": 1.9375,
+ "step": 1215
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.5862077474594116,
+ "learning_rate": 3.424171435363623e-05,
+ "loss": 2.0271,
+ "step": 1220
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 2.0107533931732178,
+ "learning_rate": 3.412811096126461e-05,
+ "loss": 2.1897,
+ "step": 1225
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.4544458389282227,
+ "learning_rate": 3.401428966858387e-05,
+ "loss": 1.9978,
+ "step": 1230
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.188170075416565,
+ "learning_rate": 3.390025319266167e-05,
+ "loss": 2.0688,
+ "step": 1235
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.1016322374343872,
+ "learning_rate": 3.3786004255702336e-05,
+ "loss": 2.0396,
+ "step": 1240
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.6623334884643555,
+ "learning_rate": 3.3671545584981954e-05,
+ "loss": 1.9566,
+ "step": 1245
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9161584377288818,
+ "learning_rate": 3.355687991278324e-05,
+ "loss": 2.0474,
+ "step": 1250
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9911025166511536,
+ "learning_rate": 3.3442009976330305e-05,
+ "loss": 2.2163,
+ "step": 1255
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.1504255533218384,
+ "learning_rate": 3.332693851772331e-05,
+ "loss": 2.1088,
+ "step": 1260
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9544184803962708,
+ "learning_rate": 3.3211668283873035e-05,
+ "loss": 1.8947,
+ "step": 1265
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.4625756740570068,
+ "learning_rate": 3.3096202026435304e-05,
+ "loss": 2.1748,
+ "step": 1270
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.3267475366592407,
+ "learning_rate": 3.298054250174527e-05,
+ "loss": 1.9218,
+ "step": 1275
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.9869363903999329,
+ "learning_rate": 3.2864692470751654e-05,
+ "loss": 2.2723,
+ "step": 1280
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.5177838802337646,
+ "learning_rate": 3.27486546989508e-05,
+ "loss": 2.1456,
+ "step": 1285
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.1998714208602905,
+ "learning_rate": 3.263243195632068e-05,
+ "loss": 1.8877,
+ "step": 1290
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.2112164497375488,
+ "learning_rate": 3.2516027017254785e-05,
+ "loss": 2.0615,
+ "step": 1295
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.0616129636764526,
+ "learning_rate": 3.239944266049587e-05,
+ "loss": 2.0402,
+ "step": 1300
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.4537287950515747,
+ "learning_rate": 3.228268166906962e-05,
+ "loss": 2.0728,
+ "step": 1305
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.3899391889572144,
+ "learning_rate": 3.2165746830218254e-05,
+ "loss": 2.1815,
+ "step": 1310
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.332529067993164,
+ "learning_rate": 3.204864093533394e-05,
+ "loss": 1.8935,
+ "step": 1315
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.4466496706008911,
+ "learning_rate": 3.193136677989221e-05,
+ "loss": 1.9567,
+ "step": 1320
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.1781721115112305,
+ "learning_rate": 3.181392716338516e-05,
+ "loss": 2.055,
+ "step": 1325
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 0.9411901831626892,
+ "learning_rate": 3.1696324889254716e-05,
+ "loss": 1.8794,
+ "step": 1330
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.2628341913223267,
+ "learning_rate": 3.15785627648256e-05,
+ "loss": 2.0299,
+ "step": 1335
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.4857370853424072,
+ "learning_rate": 3.146064360123846e-05,
+ "loss": 1.9342,
+ "step": 1340
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.661470651626587,
+ "learning_rate": 3.1342570213382594e-05,
+ "loss": 2.0399,
+ "step": 1345
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.522845983505249,
+ "learning_rate": 3.122434541982888e-05,
+ "loss": 2.1419,
+ "step": 1350
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.5679118633270264,
+ "learning_rate": 3.110597204276247e-05,
+ "loss": 2.2932,
+ "step": 1355
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.3367788791656494,
+ "learning_rate": 3.098745290791539e-05,
+ "loss": 1.8989,
+ "step": 1360
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.3873472213745117,
+ "learning_rate": 3.086879084449907e-05,
+ "loss": 2.1214,
+ "step": 1365
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.2957035303115845,
+ "learning_rate": 3.074998868513688e-05,
+ "loss": 2.2538,
+ "step": 1370
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.122176170349121,
+ "learning_rate": 3.0631049265796465e-05,
+ "loss": 2.0974,
+ "step": 1375
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.0422618389129639,
+ "learning_rate": 3.051197542572203e-05,
+ "loss": 2.054,
+ "step": 1380
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.1926140785217285,
+ "learning_rate": 3.0392770007366584e-05,
+ "loss": 1.9798,
+ "step": 1385
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 0.8764025568962097,
+ "learning_rate": 3.0273435856324112e-05,
+ "loss": 2.0796,
+ "step": 1390
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 0.8200764656066895,
+ "learning_rate": 3.0153975821261605e-05,
+ "loss": 1.9116,
+ "step": 1395
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.0340498685836792,
+ "learning_rate": 3.0034392753851066e-05,
+ "loss": 2.0235,
+ "step": 1400
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 1.882972921135104e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1400/training_args.bin b/checkpoint-1400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-1400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-1500/README.md b/checkpoint-1500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-1500/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-1500/adapter_config.json b/checkpoint-1500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-1500/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1500/adapter_model.safetensors b/checkpoint-1500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6c2d95973ab5dcc39d138d45ed82277df67fe94c
--- /dev/null
+++ b/checkpoint-1500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2f60d025081c52abcefd12bd979226e64c15fd546a1ee3e1adf198151a00500
+size 2099272
diff --git a/checkpoint-1500/optimizer.pt b/checkpoint-1500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..86366385128e9283e92abda1cbfbc83f9402ca08
--- /dev/null
+++ b/checkpoint-1500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7a0a5e958488fbf75d0f567d49412c03fdbb981c6b650f3bdb392b7fa1d76a1
+size 4208302
diff --git a/checkpoint-1500/rng_state.pth b/checkpoint-1500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d0432b613bb44fb8774aebf9e899af9d1f00491d
--- /dev/null
+++ b/checkpoint-1500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73915b1ab178881d6013bb6a6fe6df32c58dc47c81965c3d636ff61253042ecb
+size 14244
diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9b0ebc24aa6ed81d67da211547b65c827d85b097
--- /dev/null
+++ b/checkpoint-1500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15e6dbea6b02ec35a2eb009d4eef91b1c0ffa2c9fe7ffea15f5ebda80bcb5333
+size 1064
diff --git a/checkpoint-1500/special_tokens_map.json b/checkpoint-1500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-1500/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-1500/tokenizer.model b/checkpoint-1500/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-1500/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-1500/tokenizer_config.json b/checkpoint-1500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-1500/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..acd2e02b84fe640d3cb9761fcd12113bbe806d38
--- /dev/null
+++ b/checkpoint-1500/trainer_state.json
@@ -0,0 +1,2121 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.46643603995802074,
+ "eval_steps": 500,
+ "global_step": 1500,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.8182051181793213,
+ "learning_rate": 4.701736843304025e-05,
+ "loss": 2.0951,
+ "step": 505
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.507320761680603,
+ "learning_rate": 4.695924727871805e-05,
+ "loss": 2.0253,
+ "step": 510
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.759121835231781,
+ "learning_rate": 4.690060192757242e-05,
+ "loss": 2.0602,
+ "step": 515
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.5943195819854736,
+ "learning_rate": 4.684143377954691e-05,
+ "loss": 2.0386,
+ "step": 520
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.8568710088729858,
+ "learning_rate": 4.6781744247064955e-05,
+ "loss": 2.073,
+ "step": 525
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.3352620601654053,
+ "learning_rate": 4.6721534754996125e-05,
+ "loss": 2.1443,
+ "step": 530
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.3417474031448364,
+ "learning_rate": 4.666080674062213e-05,
+ "loss": 2.0288,
+ "step": 535
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.5334464311599731,
+ "learning_rate": 4.659956165360251e-05,
+ "loss": 2.0609,
+ "step": 540
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.9658721089363098,
+ "learning_rate": 4.6537800955940005e-05,
+ "loss": 1.9539,
+ "step": 545
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.9197947978973389,
+ "learning_rate": 4.647552612194572e-05,
+ "loss": 2.149,
+ "step": 550
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.8512137532234192,
+ "learning_rate": 4.641273863820383e-05,
+ "loss": 1.9722,
+ "step": 555
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.827289342880249,
+ "learning_rate": 4.634944000353622e-05,
+ "loss": 2.0729,
+ "step": 560
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.088416337966919,
+ "learning_rate": 4.628563172896655e-05,
+ "loss": 1.9507,
+ "step": 565
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3566908836364746,
+ "learning_rate": 4.6221315337684353e-05,
+ "loss": 2.1643,
+ "step": 570
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3541293144226074,
+ "learning_rate": 4.615649236500854e-05,
+ "loss": 2.1839,
+ "step": 575
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 0.991269588470459,
+ "learning_rate": 4.609116435835083e-05,
+ "loss": 2.0976,
+ "step": 580
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.0280535221099854,
+ "learning_rate": 4.602533287717877e-05,
+ "loss": 2.1474,
+ "step": 585
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.013123631477356,
+ "learning_rate": 4.5958999492978524e-05,
+ "loss": 2.1873,
+ "step": 590
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1753040552139282,
+ "learning_rate": 4.589216578921737e-05,
+ "loss": 2.1744,
+ "step": 595
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1839090585708618,
+ "learning_rate": 4.582483336130586e-05,
+ "loss": 1.9982,
+ "step": 600
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.0724798440933228,
+ "learning_rate": 4.575700381655979e-05,
+ "loss": 2.1234,
+ "step": 605
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 2.009913682937622,
+ "learning_rate": 4.5688678774161796e-05,
+ "loss": 1.9478,
+ "step": 610
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.9897060394287109,
+ "learning_rate": 4.561985986512271e-05,
+ "loss": 1.8268,
+ "step": 615
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.8881808519363403,
+ "learning_rate": 4.555054873224263e-05,
+ "loss": 1.9887,
+ "step": 620
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.155900001525879,
+ "learning_rate": 4.54807470300717e-05,
+ "loss": 2.0777,
+ "step": 625
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.8782421350479126,
+ "learning_rate": 4.5410456424870596e-05,
+ "loss": 2.0566,
+ "step": 630
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.3324674367904663,
+ "learning_rate": 4.5339678594570795e-05,
+ "loss": 2.047,
+ "step": 635
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.9805939197540283,
+ "learning_rate": 4.526841522873449e-05,
+ "loss": 1.962,
+ "step": 640
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4999943971633911,
+ "learning_rate": 4.519666802851422e-05,
+ "loss": 2.0972,
+ "step": 645
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4504961967468262,
+ "learning_rate": 4.5124438706612376e-05,
+ "loss": 2.0041,
+ "step": 650
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.9078169465065002,
+ "learning_rate": 4.505172898724018e-05,
+ "loss": 2.1229,
+ "step": 655
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.1635804176330566,
+ "learning_rate": 4.497854060607662e-05,
+ "loss": 2.0195,
+ "step": 660
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.46576726436615,
+ "learning_rate": 4.490487531022699e-05,
+ "loss": 2.0745,
+ "step": 665
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.2094652652740479,
+ "learning_rate": 4.4830734858181145e-05,
+ "loss": 2.1068,
+ "step": 670
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.4738895893096924,
+ "learning_rate": 4.47561210197716e-05,
+ "loss": 1.8088,
+ "step": 675
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.23384690284729,
+ "learning_rate": 4.4681035576131215e-05,
+ "loss": 2.0995,
+ "step": 680
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.8332946300506592,
+ "learning_rate": 4.46054803196507e-05,
+ "loss": 2.0541,
+ "step": 685
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.9207485318183899,
+ "learning_rate": 4.452945705393586e-05,
+ "loss": 2.166,
+ "step": 690
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.292945146560669,
+ "learning_rate": 4.445296759376449e-05,
+ "loss": 2.0784,
+ "step": 695
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9874763488769531,
+ "learning_rate": 4.437601376504307e-05,
+ "loss": 2.2087,
+ "step": 700
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9427415132522583,
+ "learning_rate": 4.4298597404763186e-05,
+ "loss": 2.1199,
+ "step": 705
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.7369529008865356,
+ "learning_rate": 4.422072036095768e-05,
+ "loss": 2.0355,
+ "step": 710
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2423696517944336,
+ "learning_rate": 4.414238449265654e-05,
+ "loss": 2.0011,
+ "step": 715
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2304831743240356,
+ "learning_rate": 4.406359166984249e-05,
+ "loss": 2.0368,
+ "step": 720
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 0.9090413451194763,
+ "learning_rate": 4.39843437734064e-05,
+ "loss": 1.9983,
+ "step": 725
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.2729507684707642,
+ "learning_rate": 4.390464269510233e-05,
+ "loss": 2.021,
+ "step": 730
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3009227514266968,
+ "learning_rate": 4.382449033750244e-05,
+ "loss": 1.9743,
+ "step": 735
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.5456056594848633,
+ "learning_rate": 4.37438886139515e-05,
+ "loss": 2.0689,
+ "step": 740
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3235007524490356,
+ "learning_rate": 4.3662839448521264e-05,
+ "loss": 2.0838,
+ "step": 745
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 2.2074007987976074,
+ "learning_rate": 4.358134477596454e-05,
+ "loss": 2.0835,
+ "step": 750
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.403738021850586,
+ "learning_rate": 4.3499406541668966e-05,
+ "loss": 2.0916,
+ "step": 755
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0940325260162354,
+ "learning_rate": 4.3417026701610616e-05,
+ "loss": 1.972,
+ "step": 760
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.666353702545166,
+ "learning_rate": 4.3334207222307275e-05,
+ "loss": 1.927,
+ "step": 765
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0777515172958374,
+ "learning_rate": 4.325095008077154e-05,
+ "loss": 2.1192,
+ "step": 770
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.7218186855316162,
+ "learning_rate": 4.316725726446353e-05,
+ "loss": 2.0774,
+ "step": 775
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.356753945350647,
+ "learning_rate": 4.3083130771243586e-05,
+ "loss": 2.0847,
+ "step": 780
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.9967429637908936,
+ "learning_rate": 4.299857260932445e-05,
+ "loss": 2.0485,
+ "step": 785
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.6216442584991455,
+ "learning_rate": 4.2913584797223397e-05,
+ "loss": 2.1008,
+ "step": 790
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.2556742429733276,
+ "learning_rate": 4.2828169363714016e-05,
+ "loss": 1.9209,
+ "step": 795
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1800439357757568,
+ "learning_rate": 4.274232834777782e-05,
+ "loss": 1.9722,
+ "step": 800
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1313499212265015,
+ "learning_rate": 4.2656063798555515e-05,
+ "loss": 1.9176,
+ "step": 805
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.137534737586975,
+ "learning_rate": 4.256937777529815e-05,
+ "loss": 1.9929,
+ "step": 810
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.0575093030929565,
+ "learning_rate": 4.2482272347317906e-05,
+ "loss": 2.166,
+ "step": 815
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.5939594507217407,
+ "learning_rate": 4.2394749593938733e-05,
+ "loss": 2.1334,
+ "step": 820
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1045507192611694,
+ "learning_rate": 4.230681160444669e-05,
+ "loss": 2.0853,
+ "step": 825
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.3480136394500732,
+ "learning_rate": 4.221846047804009e-05,
+ "loss": 2.1802,
+ "step": 830
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1822657585144043,
+ "learning_rate": 4.2129698323779366e-05,
+ "loss": 2.0739,
+ "step": 835
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1771117448806763,
+ "learning_rate": 4.204052726053676e-05,
+ "loss": 2.0238,
+ "step": 840
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.4757814407348633,
+ "learning_rate": 4.195094941694571e-05,
+ "loss": 2.1557,
+ "step": 845
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 0.9095075726509094,
+ "learning_rate": 4.1860966931350054e-05,
+ "loss": 2.1666,
+ "step": 850
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1039543151855469,
+ "learning_rate": 4.1770581951752976e-05,
+ "loss": 2.105,
+ "step": 855
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 0.8517205119132996,
+ "learning_rate": 4.1679796635765735e-05,
+ "loss": 1.9656,
+ "step": 860
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.239492654800415,
+ "learning_rate": 4.158861315055617e-05,
+ "loss": 2.0166,
+ "step": 865
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1358321905136108,
+ "learning_rate": 4.1497033672796924e-05,
+ "loss": 2.0076,
+ "step": 870
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.6215249300003052,
+ "learning_rate": 4.140506038861356e-05,
+ "loss": 2.1594,
+ "step": 875
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.0528080463409424,
+ "learning_rate": 4.131269549353229e-05,
+ "loss": 2.1416,
+ "step": 880
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 0.8976901769638062,
+ "learning_rate": 4.1219941192427644e-05,
+ "loss": 2.1242,
+ "step": 885
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.263594388961792,
+ "learning_rate": 4.112679969946977e-05,
+ "loss": 2.02,
+ "step": 890
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4173017740249634,
+ "learning_rate": 4.103327323807162e-05,
+ "loss": 2.0438,
+ "step": 895
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.876170039176941,
+ "learning_rate": 4.093936404083585e-05,
+ "loss": 1.9806,
+ "step": 900
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4649231433868408,
+ "learning_rate": 4.0845074349501544e-05,
+ "loss": 2.1476,
+ "step": 905
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0446043014526367,
+ "learning_rate": 4.0750406414890695e-05,
+ "loss": 1.9672,
+ "step": 910
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0225305557250977,
+ "learning_rate": 4.065536249685448e-05,
+ "loss": 1.9984,
+ "step": 915
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0120617151260376,
+ "learning_rate": 4.055994486421929e-05,
+ "loss": 2.1162,
+ "step": 920
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0469881296157837,
+ "learning_rate": 4.04641557947326e-05,
+ "loss": 2.0435,
+ "step": 925
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.2435941696166992,
+ "learning_rate": 4.036799757500856e-05,
+ "loss": 2.0431,
+ "step": 930
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0055103302001953,
+ "learning_rate": 4.027147250047348e-05,
+ "loss": 2.2021,
+ "step": 935
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1212949752807617,
+ "learning_rate": 4.017458287531094e-05,
+ "loss": 1.997,
+ "step": 940
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1048357486724854,
+ "learning_rate": 4.007733101240685e-05,
+ "loss": 1.946,
+ "step": 945
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4721689224243164,
+ "learning_rate": 3.997971923329426e-05,
+ "loss": 2.0723,
+ "step": 950
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.3793156147003174,
+ "learning_rate": 3.988174986809783e-05,
+ "loss": 2.034,
+ "step": 955
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9013482928276062,
+ "learning_rate": 3.9783425255478355e-05,
+ "loss": 1.9736,
+ "step": 960
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9192422032356262,
+ "learning_rate": 3.968474774257682e-05,
+ "loss": 1.9878,
+ "step": 965
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.9304206371307373,
+ "learning_rate": 3.9585719684958446e-05,
+ "loss": 2.117,
+ "step": 970
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.0435137748718262,
+ "learning_rate": 3.948634344655639e-05,
+ "loss": 2.0585,
+ "step": 975
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4636590480804443,
+ "learning_rate": 3.938662139961538e-05,
+ "loss": 2.0409,
+ "step": 980
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.8014529943466187,
+ "learning_rate": 3.928655592463508e-05,
+ "loss": 2.0369,
+ "step": 985
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2412620782852173,
+ "learning_rate": 3.918614941031319e-05,
+ "loss": 1.967,
+ "step": 990
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.3581103086471558,
+ "learning_rate": 3.908540425348852e-05,
+ "loss": 2.0037,
+ "step": 995
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2377780675888062,
+ "learning_rate": 3.8984322859083725e-05,
+ "loss": 1.9991,
+ "step": 1000
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 0.9209259748458862,
+ "learning_rate": 3.8882907640047896e-05,
+ "loss": 2.0448,
+ "step": 1005
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.0150959491729736,
+ "learning_rate": 3.878116101729897e-05,
+ "loss": 2.0791,
+ "step": 1010
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.5959141254425049,
+ "learning_rate": 3.867908541966594e-05,
+ "loss": 1.9997,
+ "step": 1015
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.3945012092590332,
+ "learning_rate": 3.857668328383088e-05,
+ "loss": 2.0481,
+ "step": 1020
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.2361671924591064,
+ "learning_rate": 3.847395705427075e-05,
+ "loss": 2.2664,
+ "step": 1025
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.9661719799041748,
+ "learning_rate": 3.837090918319909e-05,
+ "loss": 1.9752,
+ "step": 1030
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.6995949745178223,
+ "learning_rate": 3.8267542130507436e-05,
+ "loss": 2.1332,
+ "step": 1035
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.1248412132263184,
+ "learning_rate": 3.816385836370663e-05,
+ "loss": 2.0432,
+ "step": 1040
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.8734235763549805,
+ "learning_rate": 3.805986035786789e-05,
+ "loss": 1.9618,
+ "step": 1045
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.322766661643982,
+ "learning_rate": 3.795555059556378e-05,
+ "loss": 2.0267,
+ "step": 1050
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.0396028757095337,
+ "learning_rate": 3.7850931566808866e-05,
+ "loss": 2.1075,
+ "step": 1055
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 0.9574625492095947,
+ "learning_rate": 3.7746005769000363e-05,
+ "loss": 2.156,
+ "step": 1060
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.4480133056640625,
+ "learning_rate": 3.764077570685844e-05,
+ "loss": 1.9615,
+ "step": 1065
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.5908560752868652,
+ "learning_rate": 3.753524389236648e-05,
+ "loss": 2.0928,
+ "step": 1070
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.2628813982009888,
+ "learning_rate": 3.742941284471111e-05,
+ "loss": 2.1074,
+ "step": 1075
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2687503099441528,
+ "learning_rate": 3.7323285090222054e-05,
+ "loss": 1.9666,
+ "step": 1080
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2571731805801392,
+ "learning_rate": 3.721686316231181e-05,
+ "loss": 2.0468,
+ "step": 1085
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.007453441619873,
+ "learning_rate": 3.7110149601415215e-05,
+ "loss": 2.0624,
+ "step": 1090
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2390377521514893,
+ "learning_rate": 3.700314695492876e-05,
+ "loss": 1.9888,
+ "step": 1095
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.0878371000289917,
+ "learning_rate": 3.6895857777149825e-05,
+ "loss": 2.1013,
+ "step": 1100
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 0.8759217262268066,
+ "learning_rate": 3.6788284629215624e-05,
+ "loss": 1.875,
+ "step": 1105
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.1345970630645752,
+ "learning_rate": 3.668043007904219e-05,
+ "loss": 1.9096,
+ "step": 1110
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.253629446029663,
+ "learning_rate": 3.6572296701262966e-05,
+ "loss": 2.1859,
+ "step": 1115
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 0.9796190857887268,
+ "learning_rate": 3.646388707716738e-05,
+ "loss": 2.2092,
+ "step": 1120
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.3893767595291138,
+ "learning_rate": 3.635520379463926e-05,
+ "loss": 2.0026,
+ "step": 1125
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 0.8778309226036072,
+ "learning_rate": 3.6246249448095004e-05,
+ "loss": 2.2112,
+ "step": 1130
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.2479698657989502,
+ "learning_rate": 3.6137026638421696e-05,
+ "loss": 2.0221,
+ "step": 1135
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.3813824653625488,
+ "learning_rate": 3.6027537972914974e-05,
+ "loss": 1.9106,
+ "step": 1140
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.2043218612670898,
+ "learning_rate": 3.5917786065216826e-05,
+ "loss": 2.0673,
+ "step": 1145
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.5337340831756592,
+ "learning_rate": 3.580777353525318e-05,
+ "loss": 2.1463,
+ "step": 1150
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.155813455581665,
+ "learning_rate": 3.5697503009171385e-05,
+ "loss": 2.0255,
+ "step": 1155
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.034644365310669,
+ "learning_rate": 3.558697711927748e-05,
+ "loss": 2.1348,
+ "step": 1160
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.0959795713424683,
+ "learning_rate": 3.54761985039734e-05,
+ "loss": 2.1457,
+ "step": 1165
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.1938838958740234,
+ "learning_rate": 3.5365169807693966e-05,
+ "loss": 2.1256,
+ "step": 1170
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.8162047863006592,
+ "learning_rate": 3.525389368084379e-05,
+ "loss": 1.9587,
+ "step": 1175
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9358930587768555,
+ "learning_rate": 3.514237277973393e-05,
+ "loss": 1.8965,
+ "step": 1180
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9210988879203796,
+ "learning_rate": 3.503060976651862e-05,
+ "loss": 1.9669,
+ "step": 1185
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.4641343355178833,
+ "learning_rate": 3.491860730913156e-05,
+ "loss": 2.003,
+ "step": 1190
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.2458257675170898,
+ "learning_rate": 3.480636808122235e-05,
+ "loss": 2.1487,
+ "step": 1195
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.6770122051239014,
+ "learning_rate": 3.469389476209259e-05,
+ "loss": 2.0686,
+ "step": 1200
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9083845019340515,
+ "learning_rate": 3.458119003663199e-05,
+ "loss": 2.0284,
+ "step": 1205
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.2679696083068848,
+ "learning_rate": 3.446825659525421e-05,
+ "loss": 2.0555,
+ "step": 1210
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.3823720216751099,
+ "learning_rate": 3.435509713383268e-05,
+ "loss": 1.9375,
+ "step": 1215
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.5862077474594116,
+ "learning_rate": 3.424171435363623e-05,
+ "loss": 2.0271,
+ "step": 1220
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 2.0107533931732178,
+ "learning_rate": 3.412811096126461e-05,
+ "loss": 2.1897,
+ "step": 1225
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.4544458389282227,
+ "learning_rate": 3.401428966858387e-05,
+ "loss": 1.9978,
+ "step": 1230
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.188170075416565,
+ "learning_rate": 3.390025319266167e-05,
+ "loss": 2.0688,
+ "step": 1235
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.1016322374343872,
+ "learning_rate": 3.3786004255702336e-05,
+ "loss": 2.0396,
+ "step": 1240
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.6623334884643555,
+ "learning_rate": 3.3671545584981954e-05,
+ "loss": 1.9566,
+ "step": 1245
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9161584377288818,
+ "learning_rate": 3.355687991278324e-05,
+ "loss": 2.0474,
+ "step": 1250
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9911025166511536,
+ "learning_rate": 3.3442009976330305e-05,
+ "loss": 2.2163,
+ "step": 1255
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.1504255533218384,
+ "learning_rate": 3.332693851772331e-05,
+ "loss": 2.1088,
+ "step": 1260
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9544184803962708,
+ "learning_rate": 3.3211668283873035e-05,
+ "loss": 1.8947,
+ "step": 1265
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.4625756740570068,
+ "learning_rate": 3.3096202026435304e-05,
+ "loss": 2.1748,
+ "step": 1270
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.3267475366592407,
+ "learning_rate": 3.298054250174527e-05,
+ "loss": 1.9218,
+ "step": 1275
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.9869363903999329,
+ "learning_rate": 3.2864692470751654e-05,
+ "loss": 2.2723,
+ "step": 1280
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.5177838802337646,
+ "learning_rate": 3.27486546989508e-05,
+ "loss": 2.1456,
+ "step": 1285
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.1998714208602905,
+ "learning_rate": 3.263243195632068e-05,
+ "loss": 1.8877,
+ "step": 1290
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.2112164497375488,
+ "learning_rate": 3.2516027017254785e-05,
+ "loss": 2.0615,
+ "step": 1295
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.0616129636764526,
+ "learning_rate": 3.239944266049587e-05,
+ "loss": 2.0402,
+ "step": 1300
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.4537287950515747,
+ "learning_rate": 3.228268166906962e-05,
+ "loss": 2.0728,
+ "step": 1305
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.3899391889572144,
+ "learning_rate": 3.2165746830218254e-05,
+ "loss": 2.1815,
+ "step": 1310
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.332529067993164,
+ "learning_rate": 3.204864093533394e-05,
+ "loss": 1.8935,
+ "step": 1315
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.4466496706008911,
+ "learning_rate": 3.193136677989221e-05,
+ "loss": 1.9567,
+ "step": 1320
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.1781721115112305,
+ "learning_rate": 3.181392716338516e-05,
+ "loss": 2.055,
+ "step": 1325
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 0.9411901831626892,
+ "learning_rate": 3.1696324889254716e-05,
+ "loss": 1.8794,
+ "step": 1330
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.2628341913223267,
+ "learning_rate": 3.15785627648256e-05,
+ "loss": 2.0299,
+ "step": 1335
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.4857370853424072,
+ "learning_rate": 3.146064360123846e-05,
+ "loss": 1.9342,
+ "step": 1340
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.661470651626587,
+ "learning_rate": 3.1342570213382594e-05,
+ "loss": 2.0399,
+ "step": 1345
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.522845983505249,
+ "learning_rate": 3.122434541982888e-05,
+ "loss": 2.1419,
+ "step": 1350
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.5679118633270264,
+ "learning_rate": 3.110597204276247e-05,
+ "loss": 2.2932,
+ "step": 1355
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.3367788791656494,
+ "learning_rate": 3.098745290791539e-05,
+ "loss": 1.8989,
+ "step": 1360
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.3873472213745117,
+ "learning_rate": 3.086879084449907e-05,
+ "loss": 2.1214,
+ "step": 1365
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.2957035303115845,
+ "learning_rate": 3.074998868513688e-05,
+ "loss": 2.2538,
+ "step": 1370
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.122176170349121,
+ "learning_rate": 3.0631049265796465e-05,
+ "loss": 2.0974,
+ "step": 1375
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.0422618389129639,
+ "learning_rate": 3.051197542572203e-05,
+ "loss": 2.054,
+ "step": 1380
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.1926140785217285,
+ "learning_rate": 3.0392770007366584e-05,
+ "loss": 1.9798,
+ "step": 1385
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 0.8764025568962097,
+ "learning_rate": 3.0273435856324112e-05,
+ "loss": 2.0796,
+ "step": 1390
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 0.8200764656066895,
+ "learning_rate": 3.0153975821261605e-05,
+ "loss": 1.9116,
+ "step": 1395
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.0340498685836792,
+ "learning_rate": 3.0034392753851066e-05,
+ "loss": 2.0235,
+ "step": 1400
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.0799012184143066,
+ "learning_rate": 2.9914689508701476e-05,
+ "loss": 2.1455,
+ "step": 1405
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.301015853881836,
+ "learning_rate": 2.979486894329058e-05,
+ "loss": 2.0355,
+ "step": 1410
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.2926914691925049,
+ "learning_rate": 2.9674933917896747e-05,
+ "loss": 2.0379,
+ "step": 1415
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.4712942838668823,
+ "learning_rate": 2.9554887295530647e-05,
+ "loss": 2.0802,
+ "step": 1420
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.1957335472106934,
+ "learning_rate": 2.943473194186693e-05,
+ "loss": 2.1044,
+ "step": 1425
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.568293571472168,
+ "learning_rate": 2.9314470725175792e-05,
+ "loss": 2.0121,
+ "step": 1430
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 1.4844893217086792,
+ "learning_rate": 2.919410651625455e-05,
+ "loss": 2.0717,
+ "step": 1435
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 1.3942641019821167,
+ "learning_rate": 2.907364218835904e-05,
+ "loss": 1.9522,
+ "step": 1440
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 0.7795314788818359,
+ "learning_rate": 2.8953080617135115e-05,
+ "loss": 1.9593,
+ "step": 1445
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 1.751107931137085,
+ "learning_rate": 2.8832424680549937e-05,
+ "loss": 1.8073,
+ "step": 1450
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 1.2202279567718506,
+ "learning_rate": 2.8711677258823306e-05,
+ "loss": 2.0042,
+ "step": 1455
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 1.5163853168487549,
+ "learning_rate": 2.859084123435887e-05,
+ "loss": 1.9931,
+ "step": 1460
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 0.94038987159729,
+ "learning_rate": 2.84699194916754e-05,
+ "loss": 2.1533,
+ "step": 1465
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 1.4618102312088013,
+ "learning_rate": 2.834891491733781e-05,
+ "loss": 2.029,
+ "step": 1470
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 0.9747155904769897,
+ "learning_rate": 2.822783039988836e-05,
+ "loss": 2.0241,
+ "step": 1475
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 1.0887038707733154,
+ "learning_rate": 2.8106668829777645e-05,
+ "loss": 2.0959,
+ "step": 1480
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 1.2170171737670898,
+ "learning_rate": 2.7985433099295618e-05,
+ "loss": 1.8718,
+ "step": 1485
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 1.1366883516311646,
+ "learning_rate": 2.7864126102502524e-05,
+ "loss": 2.2397,
+ "step": 1490
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 1.1206785440444946,
+ "learning_rate": 2.774275073515985e-05,
+ "loss": 2.1083,
+ "step": 1495
+ },
+ {
+ "epoch": 0.47,
+ "grad_norm": 1.126807451248169,
+ "learning_rate": 2.7621309894661167e-05,
+ "loss": 2.0764,
+ "step": 1500
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 2.01535997018112e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1500/training_args.bin b/checkpoint-1500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-1500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-200/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-200/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-200/adapter_model.safetensors b/checkpoint-200/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..78fcb02a90be8e210dcc77542f0daf40421ad364
--- /dev/null
+++ b/checkpoint-200/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f35acad31ae62bc0123d9805573e0a0ebe6a74796fe8a3e82694ed0c92738660
+size 2099272
diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..40c1d2782dd1f86624396ff5fd460ad915786524
--- /dev/null
+++ b/checkpoint-200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fbdb058d049f37cb8663b64755cdb0b9c9914449aabe57fd861b9a38be97135
+size 4208302
diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6279c5458de52148be4a6e2e6bf48ca6f1d7ef44
--- /dev/null
+++ b/checkpoint-200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6effff382e27d9bddcb6e934a8054a91fd0264c14b69428d12d2650e52650c3
+size 14244
diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0cc26892a158fe75d5e7422a78f4d02a13c821d9
--- /dev/null
+++ b/checkpoint-200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7f3c90f0256918706f894a068be28979d7cef535fe184ef7473a22beeb1d99d
+size 1064
diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-200/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-200/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-200/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d2b8e69aab1de7672010cdbd6c4860aaba506d12
--- /dev/null
+++ b/checkpoint-200/trainer_state.json
@@ -0,0 +1,301 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.06219147199440277,
+ "eval_steps": 500,
+ "global_step": 200,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 2692221596467200.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-300/README.md b/checkpoint-300/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-300/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-300/adapter_config.json b/checkpoint-300/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-300/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-300/adapter_model.safetensors b/checkpoint-300/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0ac6b14b7edcbdbd6348c7aeacfa237c7b35caff
--- /dev/null
+++ b/checkpoint-300/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0ede12c9ba9490e88263af318d5cb3c5fd89d2bd8453b124eb6975f3d7f927b
+size 2099272
diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6fa40c84ddcb648f8ffa9723491306a5fa08b7f7
--- /dev/null
+++ b/checkpoint-300/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f0a5e78073a5a58dc7958a63b026af18792e3d727e023d1643f1ddccc83d202
+size 4208302
diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f608004f7fbf3451c0be7be6f24ed27db5e66ded
--- /dev/null
+++ b/checkpoint-300/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17e78f09051534c4813981e8db41160ba79e32b82d6cc4338167319badcc1212
+size 14244
diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a7871bc7ebf8ae17ed09b08eeb31c7f5150680ea
--- /dev/null
+++ b/checkpoint-300/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:409a78607579321f124c8a569986becbe346fd5c384fbc7ec0a57d4dde2570ac
+size 1064
diff --git a/checkpoint-300/special_tokens_map.json b/checkpoint-300/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-300/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-300/tokenizer.model b/checkpoint-300/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-300/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-300/tokenizer_config.json b/checkpoint-300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-300/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8809b5c012a5c37c166d4649c4d1d897ad0724cf
--- /dev/null
+++ b/checkpoint-300/trainer_state.json
@@ -0,0 +1,441 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.09328720799160416,
+ "eval_steps": 500,
+ "global_step": 300,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 4027137608908800.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-300/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-400/README.md b/checkpoint-400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-400/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-400/adapter_config.json b/checkpoint-400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-400/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-400/adapter_model.safetensors b/checkpoint-400/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b6f5359a6e6859d44f55104fdfeb36a2ba9f73fb
--- /dev/null
+++ b/checkpoint-400/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20a6b0f10af55adab3ace13b190c0b88878638e41c43461278003f985d65f825
+size 2099272
diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c3c9c9e793696be557d62f11ed6d3ed47e2fd604
--- /dev/null
+++ b/checkpoint-400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9ba74d329c1511a275caa5a4131244a3e823ad790b076dd9d194b5664fb13a8
+size 4208302
diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..2b8c7d84529cdcd0d1d41e3149831cf4666accad
--- /dev/null
+++ b/checkpoint-400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00c6fb89b33a31f54b4c9fb5d197ffb1411fd724d6aa4a14ea649ff9c09f77ed
+size 14244
diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a4048d3f837df380c70e3220dc2809d5583e6278
--- /dev/null
+++ b/checkpoint-400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bab6de0379e26e33db70e69aa7be18f07acf14aa9554478a757b50620067887
+size 1064
diff --git a/checkpoint-400/special_tokens_map.json b/checkpoint-400/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-400/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-400/tokenizer.model b/checkpoint-400/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-400/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-400/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1bb9fb138240fed4a4ea095e3d84780bd7e37ac0
--- /dev/null
+++ b/checkpoint-400/trainer_state.json
@@ -0,0 +1,581 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.12438294398880553,
+ "eval_steps": 500,
+ "global_step": 400,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 5372900124917760.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-500/README.md b/checkpoint-500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-500/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-500/adapter_config.json b/checkpoint-500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-500/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-500/adapter_model.safetensors b/checkpoint-500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ce59d72f30638abc6470079fc46d97c21dd9b9d4
--- /dev/null
+++ b/checkpoint-500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc6a81ba16da3040dc8f987b3953f997ea002d279c55b1a455bf16d9b741d1e8
+size 2099272
diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..aad26da08fd7264698e27092d22a00d8ed372b31
--- /dev/null
+++ b/checkpoint-500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d65d51cb7f7be7ea1dc66ab6fb909512ecada77e4eb38c86d926613b6ccff12
+size 4208302
diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..14976090b4da58d701f66845fdd790fbd4138524
--- /dev/null
+++ b/checkpoint-500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1b7f6e3ef779b6d2aa0b9376ece96587db1ce1d789a4526b7d146f0155f413d
+size 14244
diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ef610a81325f8e28f320a65009309eaf1ee311d2
--- /dev/null
+++ b/checkpoint-500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4db22f85d1cc41f0dad50b7443150040453aaada654020db42304ba5fb7c6a6f
+size 1064
diff --git a/checkpoint-500/special_tokens_map.json b/checkpoint-500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-500/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-500/tokenizer.model b/checkpoint-500/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-500/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-500/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b0748573f971e218114e99fecf76fc28cebc1ce8
--- /dev/null
+++ b/checkpoint-500/trainer_state.json
@@ -0,0 +1,721 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.15547867998600692,
+ "eval_steps": 500,
+ "global_step": 500,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 6734882641674240.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-600/README.md b/checkpoint-600/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-600/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-600/adapter_config.json b/checkpoint-600/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-600/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-600/adapter_model.safetensors b/checkpoint-600/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..865c68246181451b05a7f0e1e45240b0ef786591
--- /dev/null
+++ b/checkpoint-600/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec6b39a908aacd2318834cc4f8a720c19a8db28fd18792db522289592735a1ad
+size 2099272
diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..acb1794b9077f485c2076bd60ee8e711e0788beb
--- /dev/null
+++ b/checkpoint-600/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c177b9e5841ef74c12f1bcf3fc62942e696f5958905438ccc1487083e06592fd
+size 4208302
diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..55ce195caac088df17b4aef698450a05f08155e0
--- /dev/null
+++ b/checkpoint-600/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8337806e5c44ce711d9636e9c6a356ba5a3009d12ca530b6fdc463557f027171
+size 14244
diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6311365e307c137be082e0a60309c9521b796c3e
--- /dev/null
+++ b/checkpoint-600/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b9e75e4472c974f751926764bde84f22df5de25c6e4a0448342940bc46aedcb
+size 1064
diff --git a/checkpoint-600/special_tokens_map.json b/checkpoint-600/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-600/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-600/tokenizer.model b/checkpoint-600/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-600/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-600/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..bcc0116e4e0116c68cda931099cc32e00407862b
--- /dev/null
+++ b/checkpoint-600/trainer_state.json
@@ -0,0 +1,861 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.1865744159832083,
+ "eval_steps": 500,
+ "global_step": 600,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.8182051181793213,
+ "learning_rate": 4.701736843304025e-05,
+ "loss": 2.0951,
+ "step": 505
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.507320761680603,
+ "learning_rate": 4.695924727871805e-05,
+ "loss": 2.0253,
+ "step": 510
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.759121835231781,
+ "learning_rate": 4.690060192757242e-05,
+ "loss": 2.0602,
+ "step": 515
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.5943195819854736,
+ "learning_rate": 4.684143377954691e-05,
+ "loss": 2.0386,
+ "step": 520
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.8568710088729858,
+ "learning_rate": 4.6781744247064955e-05,
+ "loss": 2.073,
+ "step": 525
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.3352620601654053,
+ "learning_rate": 4.6721534754996125e-05,
+ "loss": 2.1443,
+ "step": 530
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.3417474031448364,
+ "learning_rate": 4.666080674062213e-05,
+ "loss": 2.0288,
+ "step": 535
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.5334464311599731,
+ "learning_rate": 4.659956165360251e-05,
+ "loss": 2.0609,
+ "step": 540
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.9658721089363098,
+ "learning_rate": 4.6537800955940005e-05,
+ "loss": 1.9539,
+ "step": 545
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.9197947978973389,
+ "learning_rate": 4.647552612194572e-05,
+ "loss": 2.149,
+ "step": 550
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.8512137532234192,
+ "learning_rate": 4.641273863820383e-05,
+ "loss": 1.9722,
+ "step": 555
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.827289342880249,
+ "learning_rate": 4.634944000353622e-05,
+ "loss": 2.0729,
+ "step": 560
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.088416337966919,
+ "learning_rate": 4.628563172896655e-05,
+ "loss": 1.9507,
+ "step": 565
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3566908836364746,
+ "learning_rate": 4.6221315337684353e-05,
+ "loss": 2.1643,
+ "step": 570
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3541293144226074,
+ "learning_rate": 4.615649236500854e-05,
+ "loss": 2.1839,
+ "step": 575
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 0.991269588470459,
+ "learning_rate": 4.609116435835083e-05,
+ "loss": 2.0976,
+ "step": 580
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.0280535221099854,
+ "learning_rate": 4.602533287717877e-05,
+ "loss": 2.1474,
+ "step": 585
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.013123631477356,
+ "learning_rate": 4.5958999492978524e-05,
+ "loss": 2.1873,
+ "step": 590
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1753040552139282,
+ "learning_rate": 4.589216578921737e-05,
+ "loss": 2.1744,
+ "step": 595
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1839090585708618,
+ "learning_rate": 4.582483336130586e-05,
+ "loss": 1.9982,
+ "step": 600
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 8075669697331200.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-600/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-700/README.md b/checkpoint-700/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-700/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-700/adapter_config.json b/checkpoint-700/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-700/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-700/adapter_model.safetensors b/checkpoint-700/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..519010c05036fbfefd4f68b724a4cab3f8613d37
--- /dev/null
+++ b/checkpoint-700/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06ca6573229cb446680565985ccf24a8351147a4edf2cfc1f7cc011a62d73564
+size 2099272
diff --git a/checkpoint-700/optimizer.pt b/checkpoint-700/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..36e3a37d6c821fb93ca878f9ea471e22b0fed49a
--- /dev/null
+++ b/checkpoint-700/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4ed15662356c964d79f923ef7abd0a52daf0bdb1b2281a961ef94b297806469
+size 4208302
diff --git a/checkpoint-700/rng_state.pth b/checkpoint-700/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3aa193cfb710cbce7e0e5f0aa628c65f70ef170c
--- /dev/null
+++ b/checkpoint-700/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31d2256cedcdcf481db016cd4710c79ea21043a9ffa511d02107ef4ae12f26c6
+size 14244
diff --git a/checkpoint-700/scheduler.pt b/checkpoint-700/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8a6c68f89990ea6c342bb897e29231458a217de4
--- /dev/null
+++ b/checkpoint-700/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e3948b82370fb4926bdeac6e49739187cc941849d68582f9a3e787358b0b457
+size 1064
diff --git a/checkpoint-700/special_tokens_map.json b/checkpoint-700/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-700/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-700/tokenizer.model b/checkpoint-700/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-700/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-700/tokenizer_config.json b/checkpoint-700/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-700/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-700/trainer_state.json b/checkpoint-700/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0767ae05e7ada5d329693b9453a6d4de971d6239
--- /dev/null
+++ b/checkpoint-700/trainer_state.json
@@ -0,0 +1,1001 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.21767015198040968,
+ "eval_steps": 500,
+ "global_step": 700,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.8182051181793213,
+ "learning_rate": 4.701736843304025e-05,
+ "loss": 2.0951,
+ "step": 505
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.507320761680603,
+ "learning_rate": 4.695924727871805e-05,
+ "loss": 2.0253,
+ "step": 510
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.759121835231781,
+ "learning_rate": 4.690060192757242e-05,
+ "loss": 2.0602,
+ "step": 515
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.5943195819854736,
+ "learning_rate": 4.684143377954691e-05,
+ "loss": 2.0386,
+ "step": 520
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.8568710088729858,
+ "learning_rate": 4.6781744247064955e-05,
+ "loss": 2.073,
+ "step": 525
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.3352620601654053,
+ "learning_rate": 4.6721534754996125e-05,
+ "loss": 2.1443,
+ "step": 530
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.3417474031448364,
+ "learning_rate": 4.666080674062213e-05,
+ "loss": 2.0288,
+ "step": 535
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.5334464311599731,
+ "learning_rate": 4.659956165360251e-05,
+ "loss": 2.0609,
+ "step": 540
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.9658721089363098,
+ "learning_rate": 4.6537800955940005e-05,
+ "loss": 1.9539,
+ "step": 545
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.9197947978973389,
+ "learning_rate": 4.647552612194572e-05,
+ "loss": 2.149,
+ "step": 550
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.8512137532234192,
+ "learning_rate": 4.641273863820383e-05,
+ "loss": 1.9722,
+ "step": 555
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.827289342880249,
+ "learning_rate": 4.634944000353622e-05,
+ "loss": 2.0729,
+ "step": 560
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.088416337966919,
+ "learning_rate": 4.628563172896655e-05,
+ "loss": 1.9507,
+ "step": 565
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3566908836364746,
+ "learning_rate": 4.6221315337684353e-05,
+ "loss": 2.1643,
+ "step": 570
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3541293144226074,
+ "learning_rate": 4.615649236500854e-05,
+ "loss": 2.1839,
+ "step": 575
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 0.991269588470459,
+ "learning_rate": 4.609116435835083e-05,
+ "loss": 2.0976,
+ "step": 580
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.0280535221099854,
+ "learning_rate": 4.602533287717877e-05,
+ "loss": 2.1474,
+ "step": 585
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.013123631477356,
+ "learning_rate": 4.5958999492978524e-05,
+ "loss": 2.1873,
+ "step": 590
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1753040552139282,
+ "learning_rate": 4.589216578921737e-05,
+ "loss": 2.1744,
+ "step": 595
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1839090585708618,
+ "learning_rate": 4.582483336130586e-05,
+ "loss": 1.9982,
+ "step": 600
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.0724798440933228,
+ "learning_rate": 4.575700381655979e-05,
+ "loss": 2.1234,
+ "step": 605
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 2.009913682937622,
+ "learning_rate": 4.5688678774161796e-05,
+ "loss": 1.9478,
+ "step": 610
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.9897060394287109,
+ "learning_rate": 4.561985986512271e-05,
+ "loss": 1.8268,
+ "step": 615
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.8881808519363403,
+ "learning_rate": 4.555054873224263e-05,
+ "loss": 1.9887,
+ "step": 620
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.155900001525879,
+ "learning_rate": 4.54807470300717e-05,
+ "loss": 2.0777,
+ "step": 625
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.8782421350479126,
+ "learning_rate": 4.5410456424870596e-05,
+ "loss": 2.0566,
+ "step": 630
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.3324674367904663,
+ "learning_rate": 4.5339678594570795e-05,
+ "loss": 2.047,
+ "step": 635
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.9805939197540283,
+ "learning_rate": 4.526841522873449e-05,
+ "loss": 1.962,
+ "step": 640
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4999943971633911,
+ "learning_rate": 4.519666802851422e-05,
+ "loss": 2.0972,
+ "step": 645
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4504961967468262,
+ "learning_rate": 4.5124438706612376e-05,
+ "loss": 2.0041,
+ "step": 650
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.9078169465065002,
+ "learning_rate": 4.505172898724018e-05,
+ "loss": 2.1229,
+ "step": 655
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.1635804176330566,
+ "learning_rate": 4.497854060607662e-05,
+ "loss": 2.0195,
+ "step": 660
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.46576726436615,
+ "learning_rate": 4.490487531022699e-05,
+ "loss": 2.0745,
+ "step": 665
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.2094652652740479,
+ "learning_rate": 4.4830734858181145e-05,
+ "loss": 2.1068,
+ "step": 670
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.4738895893096924,
+ "learning_rate": 4.47561210197716e-05,
+ "loss": 1.8088,
+ "step": 675
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.23384690284729,
+ "learning_rate": 4.4681035576131215e-05,
+ "loss": 2.0995,
+ "step": 680
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.8332946300506592,
+ "learning_rate": 4.46054803196507e-05,
+ "loss": 2.0541,
+ "step": 685
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.9207485318183899,
+ "learning_rate": 4.452945705393586e-05,
+ "loss": 2.166,
+ "step": 690
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.292945146560669,
+ "learning_rate": 4.445296759376449e-05,
+ "loss": 2.0784,
+ "step": 695
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9874763488769531,
+ "learning_rate": 4.437601376504307e-05,
+ "loss": 2.2087,
+ "step": 700
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 9413869513605120.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-700/training_args.bin b/checkpoint-700/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-700/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-800/README.md b/checkpoint-800/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-800/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-800/adapter_config.json b/checkpoint-800/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-800/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-800/adapter_model.safetensors b/checkpoint-800/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e2eb95ceed0046683758566699bf45cf72b882f8
--- /dev/null
+++ b/checkpoint-800/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b49633f49bfaaa33680226ba7e593b872d26c76c02bab2bccf843da4a96db0f4
+size 2099272
diff --git a/checkpoint-800/optimizer.pt b/checkpoint-800/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4c459361f2f3ea9a7eaef5d3ab71c45ef7a08e69
--- /dev/null
+++ b/checkpoint-800/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9657b76e7d26778e03c8a161f0d330310b39906b799f62b65cf291f9e944ec0
+size 4208302
diff --git a/checkpoint-800/rng_state.pth b/checkpoint-800/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0f0a85a9c207a534562cf3b0e4c1bff185f5c576
--- /dev/null
+++ b/checkpoint-800/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c85a13b289667334e96b297491f81dd36f3f6c0aaebbb285549a5b298e8a6667
+size 14244
diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f98cf0fe651ddf3567fd51c8ae4994213637588a
--- /dev/null
+++ b/checkpoint-800/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40325a00ffe4a46be0d94decbcacf24860f4d1c73596744fe87f05daf976c07b
+size 1064
diff --git a/checkpoint-800/special_tokens_map.json b/checkpoint-800/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-800/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-800/tokenizer.model b/checkpoint-800/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-800/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-800/tokenizer_config.json b/checkpoint-800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-800/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2ece309da91295224e67df51ca720d562eff65fd
--- /dev/null
+++ b/checkpoint-800/trainer_state.json
@@ -0,0 +1,1141 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.24876588797761107,
+ "eval_steps": 500,
+ "global_step": 800,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.8182051181793213,
+ "learning_rate": 4.701736843304025e-05,
+ "loss": 2.0951,
+ "step": 505
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.507320761680603,
+ "learning_rate": 4.695924727871805e-05,
+ "loss": 2.0253,
+ "step": 510
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.759121835231781,
+ "learning_rate": 4.690060192757242e-05,
+ "loss": 2.0602,
+ "step": 515
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.5943195819854736,
+ "learning_rate": 4.684143377954691e-05,
+ "loss": 2.0386,
+ "step": 520
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.8568710088729858,
+ "learning_rate": 4.6781744247064955e-05,
+ "loss": 2.073,
+ "step": 525
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.3352620601654053,
+ "learning_rate": 4.6721534754996125e-05,
+ "loss": 2.1443,
+ "step": 530
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.3417474031448364,
+ "learning_rate": 4.666080674062213e-05,
+ "loss": 2.0288,
+ "step": 535
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.5334464311599731,
+ "learning_rate": 4.659956165360251e-05,
+ "loss": 2.0609,
+ "step": 540
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.9658721089363098,
+ "learning_rate": 4.6537800955940005e-05,
+ "loss": 1.9539,
+ "step": 545
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.9197947978973389,
+ "learning_rate": 4.647552612194572e-05,
+ "loss": 2.149,
+ "step": 550
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.8512137532234192,
+ "learning_rate": 4.641273863820383e-05,
+ "loss": 1.9722,
+ "step": 555
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.827289342880249,
+ "learning_rate": 4.634944000353622e-05,
+ "loss": 2.0729,
+ "step": 560
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.088416337966919,
+ "learning_rate": 4.628563172896655e-05,
+ "loss": 1.9507,
+ "step": 565
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3566908836364746,
+ "learning_rate": 4.6221315337684353e-05,
+ "loss": 2.1643,
+ "step": 570
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3541293144226074,
+ "learning_rate": 4.615649236500854e-05,
+ "loss": 2.1839,
+ "step": 575
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 0.991269588470459,
+ "learning_rate": 4.609116435835083e-05,
+ "loss": 2.0976,
+ "step": 580
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.0280535221099854,
+ "learning_rate": 4.602533287717877e-05,
+ "loss": 2.1474,
+ "step": 585
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.013123631477356,
+ "learning_rate": 4.5958999492978524e-05,
+ "loss": 2.1873,
+ "step": 590
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1753040552139282,
+ "learning_rate": 4.589216578921737e-05,
+ "loss": 2.1744,
+ "step": 595
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1839090585708618,
+ "learning_rate": 4.582483336130586e-05,
+ "loss": 1.9982,
+ "step": 600
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.0724798440933228,
+ "learning_rate": 4.575700381655979e-05,
+ "loss": 2.1234,
+ "step": 605
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 2.009913682937622,
+ "learning_rate": 4.5688678774161796e-05,
+ "loss": 1.9478,
+ "step": 610
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.9897060394287109,
+ "learning_rate": 4.561985986512271e-05,
+ "loss": 1.8268,
+ "step": 615
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.8881808519363403,
+ "learning_rate": 4.555054873224263e-05,
+ "loss": 1.9887,
+ "step": 620
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.155900001525879,
+ "learning_rate": 4.54807470300717e-05,
+ "loss": 2.0777,
+ "step": 625
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.8782421350479126,
+ "learning_rate": 4.5410456424870596e-05,
+ "loss": 2.0566,
+ "step": 630
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.3324674367904663,
+ "learning_rate": 4.5339678594570795e-05,
+ "loss": 2.047,
+ "step": 635
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.9805939197540283,
+ "learning_rate": 4.526841522873449e-05,
+ "loss": 1.962,
+ "step": 640
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4999943971633911,
+ "learning_rate": 4.519666802851422e-05,
+ "loss": 2.0972,
+ "step": 645
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4504961967468262,
+ "learning_rate": 4.5124438706612376e-05,
+ "loss": 2.0041,
+ "step": 650
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.9078169465065002,
+ "learning_rate": 4.505172898724018e-05,
+ "loss": 2.1229,
+ "step": 655
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.1635804176330566,
+ "learning_rate": 4.497854060607662e-05,
+ "loss": 2.0195,
+ "step": 660
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.46576726436615,
+ "learning_rate": 4.490487531022699e-05,
+ "loss": 2.0745,
+ "step": 665
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.2094652652740479,
+ "learning_rate": 4.4830734858181145e-05,
+ "loss": 2.1068,
+ "step": 670
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.4738895893096924,
+ "learning_rate": 4.47561210197716e-05,
+ "loss": 1.8088,
+ "step": 675
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.23384690284729,
+ "learning_rate": 4.4681035576131215e-05,
+ "loss": 2.0995,
+ "step": 680
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.8332946300506592,
+ "learning_rate": 4.46054803196507e-05,
+ "loss": 2.0541,
+ "step": 685
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.9207485318183899,
+ "learning_rate": 4.452945705393586e-05,
+ "loss": 2.166,
+ "step": 690
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.292945146560669,
+ "learning_rate": 4.445296759376449e-05,
+ "loss": 2.0784,
+ "step": 695
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9874763488769531,
+ "learning_rate": 4.437601376504307e-05,
+ "loss": 2.2087,
+ "step": 700
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9427415132522583,
+ "learning_rate": 4.4298597404763186e-05,
+ "loss": 2.1199,
+ "step": 705
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.7369529008865356,
+ "learning_rate": 4.422072036095768e-05,
+ "loss": 2.0355,
+ "step": 710
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2423696517944336,
+ "learning_rate": 4.414238449265654e-05,
+ "loss": 2.0011,
+ "step": 715
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2304831743240356,
+ "learning_rate": 4.406359166984249e-05,
+ "loss": 2.0368,
+ "step": 720
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 0.9090413451194763,
+ "learning_rate": 4.39843437734064e-05,
+ "loss": 1.9983,
+ "step": 725
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.2729507684707642,
+ "learning_rate": 4.390464269510233e-05,
+ "loss": 2.021,
+ "step": 730
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3009227514266968,
+ "learning_rate": 4.382449033750244e-05,
+ "loss": 1.9743,
+ "step": 735
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.5456056594848633,
+ "learning_rate": 4.37438886139515e-05,
+ "loss": 2.0689,
+ "step": 740
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3235007524490356,
+ "learning_rate": 4.3662839448521264e-05,
+ "loss": 2.0838,
+ "step": 745
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 2.2074007987976074,
+ "learning_rate": 4.358134477596454e-05,
+ "loss": 2.0835,
+ "step": 750
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.403738021850586,
+ "learning_rate": 4.3499406541668966e-05,
+ "loss": 2.0916,
+ "step": 755
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0940325260162354,
+ "learning_rate": 4.3417026701610616e-05,
+ "loss": 1.972,
+ "step": 760
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.666353702545166,
+ "learning_rate": 4.3334207222307275e-05,
+ "loss": 1.927,
+ "step": 765
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0777515172958374,
+ "learning_rate": 4.325095008077154e-05,
+ "loss": 2.1192,
+ "step": 770
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.7218186855316162,
+ "learning_rate": 4.316725726446353e-05,
+ "loss": 2.0774,
+ "step": 775
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.356753945350647,
+ "learning_rate": 4.3083130771243586e-05,
+ "loss": 2.0847,
+ "step": 780
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.9967429637908936,
+ "learning_rate": 4.299857260932445e-05,
+ "loss": 2.0485,
+ "step": 785
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.6216442584991455,
+ "learning_rate": 4.2913584797223397e-05,
+ "loss": 2.1008,
+ "step": 790
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.2556742429733276,
+ "learning_rate": 4.2828169363714016e-05,
+ "loss": 1.9209,
+ "step": 795
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1800439357757568,
+ "learning_rate": 4.274232834777782e-05,
+ "loss": 1.9722,
+ "step": 800
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 1.074142184472576e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-800/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/checkpoint-900/README.md b/checkpoint-900/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5
--- /dev/null
+++ b/checkpoint-900/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: hfl/chinese-alpaca-2-1.3b
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.9.0
\ No newline at end of file
diff --git a/checkpoint-900/adapter_config.json b/checkpoint-900/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9
--- /dev/null
+++ b/checkpoint-900/adapter_config.json
@@ -0,0 +1,28 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-900/adapter_model.safetensors b/checkpoint-900/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3682096744cfd33ce29dcfc37fc84c4fa52aea28
--- /dev/null
+++ b/checkpoint-900/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ce94ecc5ab6d3759e8ea5e03085ec03b9c2a860b5f97c6276d570119a70b613
+size 2099272
diff --git a/checkpoint-900/optimizer.pt b/checkpoint-900/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ac741897a97b0a0468113a0f72331ee1bc43c5bb
--- /dev/null
+++ b/checkpoint-900/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6fb7ce91c15977fb46bfc4d6d25811ee80e88fe72c15709a09f6b5c8935ccea
+size 4208302
diff --git a/checkpoint-900/rng_state.pth b/checkpoint-900/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1b1e07051b86e94bd8d506559e5a65a7dc9aaa22
--- /dev/null
+++ b/checkpoint-900/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd244b4ac9acd153035e9e3d15a4607589d8225655e384d50928915a88f96274
+size 14244
diff --git a/checkpoint-900/scheduler.pt b/checkpoint-900/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ec24a4794e3cd9713adc315fc32ddc6b5a730510
--- /dev/null
+++ b/checkpoint-900/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6949c1dce6aaba51a83cb179f28e554f3fe314da16271701fb1cbe14eb005cff
+size 1064
diff --git a/checkpoint-900/special_tokens_map.json b/checkpoint-900/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/checkpoint-900/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+    "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+    "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+    "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+    "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-900/tokenizer.model b/checkpoint-900/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/checkpoint-900/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/checkpoint-900/tokenizer_config.json b/checkpoint-900/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/checkpoint-900/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+      "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+      "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+      "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+      "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+  "bos_token": "<s>",
+  "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/checkpoint-900/trainer_state.json b/checkpoint-900/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..56a11a2de1a41d799bd203e05a48ecb1f2d992e3
--- /dev/null
+++ b/checkpoint-900/trainer_state.json
@@ -0,0 +1,1281 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.27986162397481246,
+ "eval_steps": 500,
+ "global_step": 900,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.8182051181793213,
+ "learning_rate": 4.701736843304025e-05,
+ "loss": 2.0951,
+ "step": 505
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.507320761680603,
+ "learning_rate": 4.695924727871805e-05,
+ "loss": 2.0253,
+ "step": 510
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.759121835231781,
+ "learning_rate": 4.690060192757242e-05,
+ "loss": 2.0602,
+ "step": 515
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.5943195819854736,
+ "learning_rate": 4.684143377954691e-05,
+ "loss": 2.0386,
+ "step": 520
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.8568710088729858,
+ "learning_rate": 4.6781744247064955e-05,
+ "loss": 2.073,
+ "step": 525
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.3352620601654053,
+ "learning_rate": 4.6721534754996125e-05,
+ "loss": 2.1443,
+ "step": 530
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.3417474031448364,
+ "learning_rate": 4.666080674062213e-05,
+ "loss": 2.0288,
+ "step": 535
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.5334464311599731,
+ "learning_rate": 4.659956165360251e-05,
+ "loss": 2.0609,
+ "step": 540
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.9658721089363098,
+ "learning_rate": 4.6537800955940005e-05,
+ "loss": 1.9539,
+ "step": 545
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.9197947978973389,
+ "learning_rate": 4.647552612194572e-05,
+ "loss": 2.149,
+ "step": 550
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.8512137532234192,
+ "learning_rate": 4.641273863820383e-05,
+ "loss": 1.9722,
+ "step": 555
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.827289342880249,
+ "learning_rate": 4.634944000353622e-05,
+ "loss": 2.0729,
+ "step": 560
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.088416337966919,
+ "learning_rate": 4.628563172896655e-05,
+ "loss": 1.9507,
+ "step": 565
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3566908836364746,
+ "learning_rate": 4.6221315337684353e-05,
+ "loss": 2.1643,
+ "step": 570
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3541293144226074,
+ "learning_rate": 4.615649236500854e-05,
+ "loss": 2.1839,
+ "step": 575
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 0.991269588470459,
+ "learning_rate": 4.609116435835083e-05,
+ "loss": 2.0976,
+ "step": 580
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.0280535221099854,
+ "learning_rate": 4.602533287717877e-05,
+ "loss": 2.1474,
+ "step": 585
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.013123631477356,
+ "learning_rate": 4.5958999492978524e-05,
+ "loss": 2.1873,
+ "step": 590
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1753040552139282,
+ "learning_rate": 4.589216578921737e-05,
+ "loss": 2.1744,
+ "step": 595
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1839090585708618,
+ "learning_rate": 4.582483336130586e-05,
+ "loss": 1.9982,
+ "step": 600
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.0724798440933228,
+ "learning_rate": 4.575700381655979e-05,
+ "loss": 2.1234,
+ "step": 605
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 2.009913682937622,
+ "learning_rate": 4.5688678774161796e-05,
+ "loss": 1.9478,
+ "step": 610
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.9897060394287109,
+ "learning_rate": 4.561985986512271e-05,
+ "loss": 1.8268,
+ "step": 615
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.8881808519363403,
+ "learning_rate": 4.555054873224263e-05,
+ "loss": 1.9887,
+ "step": 620
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.155900001525879,
+ "learning_rate": 4.54807470300717e-05,
+ "loss": 2.0777,
+ "step": 625
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.8782421350479126,
+ "learning_rate": 4.5410456424870596e-05,
+ "loss": 2.0566,
+ "step": 630
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.3324674367904663,
+ "learning_rate": 4.5339678594570795e-05,
+ "loss": 2.047,
+ "step": 635
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.9805939197540283,
+ "learning_rate": 4.526841522873449e-05,
+ "loss": 1.962,
+ "step": 640
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4999943971633911,
+ "learning_rate": 4.519666802851422e-05,
+ "loss": 2.0972,
+ "step": 645
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4504961967468262,
+ "learning_rate": 4.5124438706612376e-05,
+ "loss": 2.0041,
+ "step": 650
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.9078169465065002,
+ "learning_rate": 4.505172898724018e-05,
+ "loss": 2.1229,
+ "step": 655
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.1635804176330566,
+ "learning_rate": 4.497854060607662e-05,
+ "loss": 2.0195,
+ "step": 660
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.46576726436615,
+ "learning_rate": 4.490487531022699e-05,
+ "loss": 2.0745,
+ "step": 665
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.2094652652740479,
+ "learning_rate": 4.4830734858181145e-05,
+ "loss": 2.1068,
+ "step": 670
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.4738895893096924,
+ "learning_rate": 4.47561210197716e-05,
+ "loss": 1.8088,
+ "step": 675
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.23384690284729,
+ "learning_rate": 4.4681035576131215e-05,
+ "loss": 2.0995,
+ "step": 680
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.8332946300506592,
+ "learning_rate": 4.46054803196507e-05,
+ "loss": 2.0541,
+ "step": 685
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.9207485318183899,
+ "learning_rate": 4.452945705393586e-05,
+ "loss": 2.166,
+ "step": 690
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.292945146560669,
+ "learning_rate": 4.445296759376449e-05,
+ "loss": 2.0784,
+ "step": 695
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9874763488769531,
+ "learning_rate": 4.437601376504307e-05,
+ "loss": 2.2087,
+ "step": 700
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9427415132522583,
+ "learning_rate": 4.4298597404763186e-05,
+ "loss": 2.1199,
+ "step": 705
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.7369529008865356,
+ "learning_rate": 4.422072036095768e-05,
+ "loss": 2.0355,
+ "step": 710
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2423696517944336,
+ "learning_rate": 4.414238449265654e-05,
+ "loss": 2.0011,
+ "step": 715
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2304831743240356,
+ "learning_rate": 4.406359166984249e-05,
+ "loss": 2.0368,
+ "step": 720
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 0.9090413451194763,
+ "learning_rate": 4.39843437734064e-05,
+ "loss": 1.9983,
+ "step": 725
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.2729507684707642,
+ "learning_rate": 4.390464269510233e-05,
+ "loss": 2.021,
+ "step": 730
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3009227514266968,
+ "learning_rate": 4.382449033750244e-05,
+ "loss": 1.9743,
+ "step": 735
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.5456056594848633,
+ "learning_rate": 4.37438886139515e-05,
+ "loss": 2.0689,
+ "step": 740
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3235007524490356,
+ "learning_rate": 4.3662839448521264e-05,
+ "loss": 2.0838,
+ "step": 745
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 2.2074007987976074,
+ "learning_rate": 4.358134477596454e-05,
+ "loss": 2.0835,
+ "step": 750
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.403738021850586,
+ "learning_rate": 4.3499406541668966e-05,
+ "loss": 2.0916,
+ "step": 755
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0940325260162354,
+ "learning_rate": 4.3417026701610616e-05,
+ "loss": 1.972,
+ "step": 760
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.666353702545166,
+ "learning_rate": 4.3334207222307275e-05,
+ "loss": 1.927,
+ "step": 765
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0777515172958374,
+ "learning_rate": 4.325095008077154e-05,
+ "loss": 2.1192,
+ "step": 770
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.7218186855316162,
+ "learning_rate": 4.316725726446353e-05,
+ "loss": 2.0774,
+ "step": 775
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.356753945350647,
+ "learning_rate": 4.3083130771243586e-05,
+ "loss": 2.0847,
+ "step": 780
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.9967429637908936,
+ "learning_rate": 4.299857260932445e-05,
+ "loss": 2.0485,
+ "step": 785
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.6216442584991455,
+ "learning_rate": 4.2913584797223397e-05,
+ "loss": 2.1008,
+ "step": 790
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.2556742429733276,
+ "learning_rate": 4.2828169363714016e-05,
+ "loss": 1.9209,
+ "step": 795
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1800439357757568,
+ "learning_rate": 4.274232834777782e-05,
+ "loss": 1.9722,
+ "step": 800
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1313499212265015,
+ "learning_rate": 4.2656063798555515e-05,
+ "loss": 1.9176,
+ "step": 805
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.137534737586975,
+ "learning_rate": 4.256937777529815e-05,
+ "loss": 1.9929,
+ "step": 810
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.0575093030929565,
+ "learning_rate": 4.2482272347317906e-05,
+ "loss": 2.166,
+ "step": 815
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.5939594507217407,
+ "learning_rate": 4.2394749593938733e-05,
+ "loss": 2.1334,
+ "step": 820
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1045507192611694,
+ "learning_rate": 4.230681160444669e-05,
+ "loss": 2.0853,
+ "step": 825
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.3480136394500732,
+ "learning_rate": 4.221846047804009e-05,
+ "loss": 2.1802,
+ "step": 830
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1822657585144043,
+ "learning_rate": 4.2129698323779366e-05,
+ "loss": 2.0739,
+ "step": 835
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1771117448806763,
+ "learning_rate": 4.204052726053676e-05,
+ "loss": 2.0238,
+ "step": 840
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.4757814407348633,
+ "learning_rate": 4.195094941694571e-05,
+ "loss": 2.1557,
+ "step": 845
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 0.9095075726509094,
+ "learning_rate": 4.1860966931350054e-05,
+ "loss": 2.1666,
+ "step": 850
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1039543151855469,
+ "learning_rate": 4.1770581951752976e-05,
+ "loss": 2.105,
+ "step": 855
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 0.8517205119132996,
+ "learning_rate": 4.1679796635765735e-05,
+ "loss": 1.9656,
+ "step": 860
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.239492654800415,
+ "learning_rate": 4.158861315055617e-05,
+ "loss": 2.0166,
+ "step": 865
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1358321905136108,
+ "learning_rate": 4.1497033672796924e-05,
+ "loss": 2.0076,
+ "step": 870
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.6215249300003052,
+ "learning_rate": 4.140506038861356e-05,
+ "loss": 2.1594,
+ "step": 875
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.0528080463409424,
+ "learning_rate": 4.131269549353229e-05,
+ "loss": 2.1416,
+ "step": 880
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 0.8976901769638062,
+ "learning_rate": 4.1219941192427644e-05,
+ "loss": 2.1242,
+ "step": 885
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.263594388961792,
+ "learning_rate": 4.112679969946977e-05,
+ "loss": 2.02,
+ "step": 890
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4173017740249634,
+ "learning_rate": 4.103327323807162e-05,
+ "loss": 2.0438,
+ "step": 895
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.876170039176941,
+ "learning_rate": 4.093936404083585e-05,
+ "loss": 1.9806,
+ "step": 900
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 1.210101614051328e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-900/training_args.bin b/checkpoint-900/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/checkpoint-900/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+    "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+    "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+    "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+    "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/tokenizer.model b/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a
--- /dev/null
+++ b/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c
+size 844403
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,54 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "chat_template": "{% set system_message = 'You are a helpful assistant. 你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "split_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": false
+}
diff --git a/train_results.json b/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b5f4085a932af8484c59597eb4f5cc1bb81a42f8
--- /dev/null
+++ b/train_results.json
@@ -0,0 +1,7 @@
+{
+ "epoch": 0.48,
+ "train_loss": 2.0602192145127516,
+ "train_runtime": 801.4891,
+ "train_samples_per_second": 64.198,
+ "train_steps_per_second": 4.011
+}
\ No newline at end of file
diff --git a/trainer_log.jsonl b/trainer_log.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..7ee9f09e3b9bc99ef17371c007f04058a0fd7ae8
--- /dev/null
+++ b/trainer_log.jsonl
@@ -0,0 +1,307 @@
+{"current_steps": 5, "total_steps": 3215, "loss": 2.0765, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.999970160815579e-05, "epoch": 0.0, "percentage": 0.16, "elapsed_time": "0:00:02", "remaining_time": "0:31:24"}
+{"current_steps": 10, "total_steps": 3215, "loss": 2.2297, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.999880643974619e-05, "epoch": 0.0, "percentage": 0.31, "elapsed_time": "0:00:05", "remaining_time": "0:30:02"}
+{"current_steps": 15, "total_steps": 3215, "loss": 2.1103, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9997314516140056e-05, "epoch": 0.0, "percentage": 0.47, "elapsed_time": "0:00:08", "remaining_time": "0:29:25"}
+{"current_steps": 20, "total_steps": 3215, "loss": 2.0057, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.999522587295162e-05, "epoch": 0.01, "percentage": 0.62, "elapsed_time": "0:00:10", "remaining_time": "0:29:01"}
+{"current_steps": 25, "total_steps": 3215, "loss": 2.1778, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.999254056003963e-05, "epoch": 0.01, "percentage": 0.78, "elapsed_time": "0:00:13", "remaining_time": "0:28:45"}
+{"current_steps": 30, "total_steps": 3215, "loss": 2.2399, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.99892586415061e-05, "epoch": 0.01, "percentage": 0.93, "elapsed_time": "0:00:16", "remaining_time": "0:28:34"}
+{"current_steps": 35, "total_steps": 3215, "loss": 2.3215, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9985380195694856e-05, "epoch": 0.01, "percentage": 1.09, "elapsed_time": "0:00:18", "remaining_time": "0:28:22"}
+{"current_steps": 40, "total_steps": 3215, "loss": 1.8295, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.998090531518962e-05, "epoch": 0.01, "percentage": 1.24, "elapsed_time": "0:00:21", "remaining_time": "0:28:20"}
+{"current_steps": 45, "total_steps": 3215, "loss": 2.0195, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9975834106811834e-05, "epoch": 0.01, "percentage": 1.4, "elapsed_time": "0:00:24", "remaining_time": "0:28:10"}
+{"current_steps": 50, "total_steps": 3215, "loss": 2.1257, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.997016669161806e-05, "epoch": 0.02, "percentage": 1.56, "elapsed_time": "0:00:26", "remaining_time": "0:27:52"}
+{"current_steps": 55, "total_steps": 3215, "loss": 2.057, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.996390320489715e-05, "epoch": 0.02, "percentage": 1.71, "elapsed_time": "0:00:29", "remaining_time": "0:28:02"}
+{"current_steps": 60, "total_steps": 3215, "loss": 2.0753, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9957043796166966e-05, "epoch": 0.02, "percentage": 1.87, "elapsed_time": "0:00:31", "remaining_time": "0:27:56"}
+{"current_steps": 65, "total_steps": 3215, "loss": 1.9736, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.994958862917083e-05, "epoch": 0.02, "percentage": 2.02, "elapsed_time": "0:00:34", "remaining_time": "0:27:43"}
+{"current_steps": 70, "total_steps": 3215, "loss": 2.1572, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.994153788187363e-05, "epoch": 0.02, "percentage": 2.18, "elapsed_time": "0:00:37", "remaining_time": "0:27:52"}
+{"current_steps": 75, "total_steps": 3215, "loss": 2.1491, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.993289174645757e-05, "epoch": 0.02, "percentage": 2.33, "elapsed_time": "0:00:39", "remaining_time": "0:27:47"}
+{"current_steps": 80, "total_steps": 3215, "loss": 1.945, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.992365042931752e-05, "epoch": 0.02, "percentage": 2.49, "elapsed_time": "0:00:42", "remaining_time": "0:27:37"}
+{"current_steps": 85, "total_steps": 3215, "loss": 2.0811, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.991381415105619e-05, "epoch": 0.03, "percentage": 2.64, "elapsed_time": "0:00:45", "remaining_time": "0:27:47"}
+{"current_steps": 90, "total_steps": 3215, "loss": 1.961, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.990338314647881e-05, "epoch": 0.03, "percentage": 2.8, "elapsed_time": "0:00:47", "remaining_time": "0:27:45"}
+{"current_steps": 95, "total_steps": 3215, "loss": 2.0653, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.98923576645875e-05, "epoch": 0.03, "percentage": 2.95, "elapsed_time": "0:00:50", "remaining_time": "0:27:44"}
+{"current_steps": 100, "total_steps": 3215, "loss": 1.9999, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9880737968575365e-05, "epoch": 0.03, "percentage": 3.11, "elapsed_time": "0:00:53", "remaining_time": "0:27:43"}
+{"current_steps": 105, "total_steps": 3215, "loss": 2.2258, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.986852433582022e-05, "epoch": 0.03, "percentage": 3.27, "elapsed_time": "0:00:56", "remaining_time": "0:27:57"}
+{"current_steps": 110, "total_steps": 3215, "loss": 2.1034, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.985571705787793e-05, "epoch": 0.03, "percentage": 3.42, "elapsed_time": "0:00:59", "remaining_time": "0:28:00"}
+{"current_steps": 115, "total_steps": 3215, "loss": 2.1753, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9842316440475475e-05, "epoch": 0.04, "percentage": 3.58, "elapsed_time": "0:01:02", "remaining_time": "0:27:57"}
+{"current_steps": 120, "total_steps": 3215, "loss": 2.1384, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9828322803503665e-05, "epoch": 0.04, "percentage": 3.73, "elapsed_time": "0:01:04", "remaining_time": "0:27:53"}
+{"current_steps": 125, "total_steps": 3215, "loss": 2.0521, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.981373648100946e-05, "epoch": 0.04, "percentage": 3.89, "elapsed_time": "0:01:07", "remaining_time": "0:27:52"}
+{"current_steps": 130, "total_steps": 3215, "loss": 1.9256, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.979855782118802e-05, "epoch": 0.04, "percentage": 4.04, "elapsed_time": "0:01:10", "remaining_time": "0:27:48"}
+{"current_steps": 135, "total_steps": 3215, "loss": 2.0882, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.978278718637443e-05, "epoch": 0.04, "percentage": 4.2, "elapsed_time": "0:01:13", "remaining_time": "0:27:47"}
+{"current_steps": 140, "total_steps": 3215, "loss": 2.0724, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9766424953035e-05, "epoch": 0.04, "percentage": 4.35, "elapsed_time": "0:01:15", "remaining_time": "0:27:44"}
+{"current_steps": 145, "total_steps": 3215, "loss": 2.1329, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.974947151175826e-05, "epoch": 0.05, "percentage": 4.51, "elapsed_time": "0:01:18", "remaining_time": "0:27:40"}
+{"current_steps": 150, "total_steps": 3215, "loss": 2.082, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.973192726724572e-05, "epoch": 0.05, "percentage": 4.67, "elapsed_time": "0:01:21", "remaining_time": "0:27:38"}
+{"current_steps": 155, "total_steps": 3215, "loss": 2.0366, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9713792638302145e-05, "epoch": 0.05, "percentage": 4.82, "elapsed_time": "0:01:23", "remaining_time": "0:27:36"}
+{"current_steps": 160, "total_steps": 3215, "loss": 2.1481, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.969506805782555e-05, "epoch": 0.05, "percentage": 4.98, "elapsed_time": "0:01:26", "remaining_time": "0:27:33"}
+{"current_steps": 165, "total_steps": 3215, "loss": 2.032, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.967575397279689e-05, "epoch": 0.05, "percentage": 5.13, "elapsed_time": "0:01:29", "remaining_time": "0:27:29"}
+{"current_steps": 170, "total_steps": 3215, "loss": 2.0379, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.965585084426943e-05, "epoch": 0.05, "percentage": 5.29, "elapsed_time": "0:01:31", "remaining_time": "0:27:27"}
+{"current_steps": 175, "total_steps": 3215, "loss": 2.1444, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9635359147357655e-05, "epoch": 0.05, "percentage": 5.44, "elapsed_time": "0:01:34", "remaining_time": "0:27:23"}
+{"current_steps": 180, "total_steps": 3215, "loss": 1.9164, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.961427937122598e-05, "epoch": 0.06, "percentage": 5.6, "elapsed_time": "0:01:37", "remaining_time": "0:27:20"}
+{"current_steps": 185, "total_steps": 3215, "loss": 2.0084, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.959261201907707e-05, "epoch": 0.06, "percentage": 5.75, "elapsed_time": "0:01:40", "remaining_time": "0:27:18"}
+{"current_steps": 190, "total_steps": 3215, "loss": 2.2032, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.957035760813982e-05, "epoch": 0.06, "percentage": 5.91, "elapsed_time": "0:01:42", "remaining_time": "0:27:12"}
+{"current_steps": 195, "total_steps": 3215, "loss": 2.2101, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.954751666965701e-05, "epoch": 0.06, "percentage": 6.07, "elapsed_time": "0:01:45", "remaining_time": "0:27:14"}
+{"current_steps": 200, "total_steps": 3215, "loss": 2.0472, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9524089748872615e-05, "epoch": 0.06, "percentage": 6.22, "elapsed_time": "0:01:48", "remaining_time": "0:27:12"}
+{"current_steps": 205, "total_steps": 3215, "loss": 2.0987, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9500077405018807e-05, "epoch": 0.06, "percentage": 6.38, "elapsed_time": "0:01:51", "remaining_time": "0:27:18"}
+{"current_steps": 210, "total_steps": 3215, "loss": 2.1765, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9475480211302583e-05, "epoch": 0.07, "percentage": 6.53, "elapsed_time": "0:01:54", "remaining_time": "0:27:15"}
+{"current_steps": 215, "total_steps": 3215, "loss": 1.9926, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.945029875489212e-05, "epoch": 0.07, "percentage": 6.69, "elapsed_time": "0:01:56", "remaining_time": "0:27:12"}
+{"current_steps": 220, "total_steps": 3215, "loss": 2.0124, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.94245336369027e-05, "epoch": 0.07, "percentage": 6.84, "elapsed_time": "0:01:59", "remaining_time": "0:27:13"}
+{"current_steps": 225, "total_steps": 3215, "loss": 2.2229, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.939818547238241e-05, "epoch": 0.07, "percentage": 7.0, "elapsed_time": "0:02:02", "remaining_time": "0:27:08"}
+{"current_steps": 230, "total_steps": 3215, "loss": 2.2013, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9371254890297446e-05, "epoch": 0.07, "percentage": 7.15, "elapsed_time": "0:02:05", "remaining_time": "0:27:04"}
+{"current_steps": 235, "total_steps": 3215, "loss": 2.014, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.93437425335171e-05, "epoch": 0.07, "percentage": 7.31, "elapsed_time": "0:02:07", "remaining_time": "0:27:00"}
+{"current_steps": 240, "total_steps": 3215, "loss": 2.1701, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9315649058798384e-05, "epoch": 0.07, "percentage": 7.47, "elapsed_time": "0:02:10", "remaining_time": "0:26:56"}
+{"current_steps": 245, "total_steps": 3215, "loss": 2.1681, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.928697513677042e-05, "epoch": 0.08, "percentage": 7.62, "elapsed_time": "0:02:13", "remaining_time": "0:26:53"}
+{"current_steps": 250, "total_steps": 3215, "loss": 2.1224, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.925772145191834e-05, "epoch": 0.08, "percentage": 7.78, "elapsed_time": "0:02:15", "remaining_time": "0:26:48"}
+{"current_steps": 255, "total_steps": 3215, "loss": 2.0512, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9227888702567044e-05, "epoch": 0.08, "percentage": 7.93, "elapsed_time": "0:02:18", "remaining_time": "0:26:44"}
+{"current_steps": 260, "total_steps": 3215, "loss": 2.1067, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9197477600864446e-05, "epoch": 0.08, "percentage": 8.09, "elapsed_time": "0:02:20", "remaining_time": "0:26:41"}
+{"current_steps": 265, "total_steps": 3215, "loss": 1.8884, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9166488872764526e-05, "epoch": 0.08, "percentage": 8.24, "elapsed_time": "0:02:23", "remaining_time": "0:26:37"}
+{"current_steps": 270, "total_steps": 3215, "loss": 1.9345, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.913492325800999e-05, "epoch": 0.08, "percentage": 8.4, "elapsed_time": "0:02:26", "remaining_time": "0:26:33"}
+{"current_steps": 275, "total_steps": 3215, "loss": 2.1928, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.910278151011458e-05, "epoch": 0.09, "percentage": 8.55, "elapsed_time": "0:02:28", "remaining_time": "0:26:29"}
+{"current_steps": 280, "total_steps": 3215, "loss": 2.0407, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.907006439634516e-05, "epoch": 0.09, "percentage": 8.71, "elapsed_time": "0:02:31", "remaining_time": "0:26:23"}
+{"current_steps": 285, "total_steps": 3215, "loss": 2.2344, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.903677269770329e-05, "epoch": 0.09, "percentage": 8.86, "elapsed_time": "0:02:33", "remaining_time": "0:26:21"}
+{"current_steps": 290, "total_steps": 3215, "loss": 2.1296, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.900290720890671e-05, "epoch": 0.09, "percentage": 9.02, "elapsed_time": "0:02:36", "remaining_time": "0:26:17"}
+{"current_steps": 295, "total_steps": 3215, "loss": 2.152, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8968468738370244e-05, "epoch": 0.09, "percentage": 9.18, "elapsed_time": "0:02:38", "remaining_time": "0:26:10"}
+{"current_steps": 300, "total_steps": 3215, "loss": 1.9623, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8933458108186606e-05, "epoch": 0.09, "percentage": 9.33, "elapsed_time": "0:02:41", "remaining_time": "0:26:06"}
+{"current_steps": 305, "total_steps": 3215, "loss": 1.915, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.889787615410672e-05, "epoch": 0.09, "percentage": 9.49, "elapsed_time": "0:02:44", "remaining_time": "0:26:07"}
+{"current_steps": 310, "total_steps": 3215, "loss": 1.9934, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.886172372551977e-05, "epoch": 0.1, "percentage": 9.64, "elapsed_time": "0:02:46", "remaining_time": "0:26:01"}
+{"current_steps": 315, "total_steps": 3215, "loss": 2.1541, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.882500168543294e-05, "epoch": 0.1, "percentage": 9.8, "elapsed_time": "0:02:48", "remaining_time": "0:25:55"}
+{"current_steps": 320, "total_steps": 3215, "loss": 2.1688, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.878771091045082e-05, "epoch": 0.1, "percentage": 9.95, "elapsed_time": "0:02:51", "remaining_time": "0:25:51"}
+{"current_steps": 325, "total_steps": 3215, "loss": 2.1387, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.874985229075446e-05, "epoch": 0.1, "percentage": 10.11, "elapsed_time": "0:02:54", "remaining_time": "0:25:51"}
+{"current_steps": 330, "total_steps": 3215, "loss": 2.0215, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.871142673008012e-05, "epoch": 0.1, "percentage": 10.26, "elapsed_time": "0:02:56", "remaining_time": "0:25:45"}
+{"current_steps": 335, "total_steps": 3215, "loss": 1.9491, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.867243514569772e-05, "epoch": 0.1, "percentage": 10.42, "elapsed_time": "0:02:59", "remaining_time": "0:25:44"}
+{"current_steps": 340, "total_steps": 3215, "loss": 2.0151, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.863287846838891e-05, "epoch": 0.11, "percentage": 10.58, "elapsed_time": "0:03:02", "remaining_time": "0:25:41"}
+{"current_steps": 345, "total_steps": 3215, "loss": 1.8906, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.85927576424249e-05, "epoch": 0.11, "percentage": 10.73, "elapsed_time": "0:03:04", "remaining_time": "0:25:37"}
+{"current_steps": 350, "total_steps": 3215, "loss": 2.1844, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.855207362554385e-05, "epoch": 0.11, "percentage": 10.89, "elapsed_time": "0:03:07", "remaining_time": "0:25:33"}
+{"current_steps": 355, "total_steps": 3215, "loss": 2.048, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.851082738892809e-05, "epoch": 0.11, "percentage": 11.04, "elapsed_time": "0:03:10", "remaining_time": "0:25:31"}
+{"current_steps": 360, "total_steps": 3215, "loss": 1.9537, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8469019917180846e-05, "epoch": 0.11, "percentage": 11.2, "elapsed_time": "0:03:12", "remaining_time": "0:25:28"}
+{"current_steps": 365, "total_steps": 3215, "loss": 1.9731, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8426652208302814e-05, "epoch": 0.11, "percentage": 11.35, "elapsed_time": "0:03:15", "remaining_time": "0:25:22"}
+{"current_steps": 370, "total_steps": 3215, "loss": 2.1395, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.83837252736683e-05, "epoch": 0.12, "percentage": 11.51, "elapsed_time": "0:03:17", "remaining_time": "0:25:21"}
+{"current_steps": 375, "total_steps": 3215, "loss": 2.0016, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.834024013800108e-05, "epoch": 0.12, "percentage": 11.66, "elapsed_time": "0:03:20", "remaining_time": "0:25:19"}
+{"current_steps": 380, "total_steps": 3215, "loss": 1.9632, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8296197839349944e-05, "epoch": 0.12, "percentage": 11.82, "elapsed_time": "0:03:23", "remaining_time": "0:25:15"}
+{"current_steps": 385, "total_steps": 3215, "loss": 2.3302, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.825159942906389e-05, "epoch": 0.12, "percentage": 11.98, "elapsed_time": "0:03:25", "remaining_time": "0:25:13"}
+{"current_steps": 390, "total_steps": 3215, "loss": 2.1517, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.820644597176709e-05, "epoch": 0.12, "percentage": 12.13, "elapsed_time": "0:03:28", "remaining_time": "0:25:10"}
+{"current_steps": 395, "total_steps": 3215, "loss": 2.1229, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.81607385453334e-05, "epoch": 0.12, "percentage": 12.29, "elapsed_time": "0:03:31", "remaining_time": "0:25:07"}
+{"current_steps": 400, "total_steps": 3215, "loss": 2.1382, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.81144782408607e-05, "epoch": 0.12, "percentage": 12.44, "elapsed_time": "0:03:33", "remaining_time": "0:25:05"}
+{"current_steps": 405, "total_steps": 3215, "loss": 1.9614, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8067666162644774e-05, "epoch": 0.13, "percentage": 12.6, "elapsed_time": "0:03:37", "remaining_time": "0:25:06"}
+{"current_steps": 410, "total_steps": 3215, "loss": 2.1399, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.802030342815304e-05, "epoch": 0.13, "percentage": 12.75, "elapsed_time": "0:03:39", "remaining_time": "0:25:02"}
+{"current_steps": 415, "total_steps": 3215, "loss": 1.9034, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7972391167997754e-05, "epoch": 0.13, "percentage": 12.91, "elapsed_time": "0:03:42", "remaining_time": "0:24:59"}
+{"current_steps": 420, "total_steps": 3215, "loss": 2.0075, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7923930525909156e-05, "epoch": 0.13, "percentage": 13.06, "elapsed_time": "0:03:45", "remaining_time": "0:24:57"}
+{"current_steps": 425, "total_steps": 3215, "loss": 2.0105, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7874922658708065e-05, "epoch": 0.13, "percentage": 13.22, "elapsed_time": "0:03:47", "remaining_time": "0:24:52"}
+{"current_steps": 430, "total_steps": 3215, "loss": 2.0242, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.782536873627832e-05, "epoch": 0.13, "percentage": 13.37, "elapsed_time": "0:03:50", "remaining_time": "0:24:50"}
+{"current_steps": 435, "total_steps": 3215, "loss": 2.0267, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.777526994153882e-05, "epoch": 0.14, "percentage": 13.53, "elapsed_time": "0:03:52", "remaining_time": "0:24:47"}
+{"current_steps": 440, "total_steps": 3215, "loss": 1.9119, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7724627470415307e-05, "epoch": 0.14, "percentage": 13.69, "elapsed_time": "0:03:55", "remaining_time": "0:24:42"}
+{"current_steps": 445, "total_steps": 3215, "loss": 2.2653, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7673442531811796e-05, "epoch": 0.14, "percentage": 13.84, "elapsed_time": "0:03:57", "remaining_time": "0:24:40"}
+{"current_steps": 450, "total_steps": 3215, "loss": 2.0017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.762171634758177e-05, "epoch": 0.14, "percentage": 14.0, "elapsed_time": "0:04:00", "remaining_time": "0:24:37"}
+{"current_steps": 455, "total_steps": 3215, "loss": 2.1408, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7569450152498927e-05, "epoch": 0.14, "percentage": 14.15, "elapsed_time": "0:04:03", "remaining_time": "0:24:34"}
+{"current_steps": 460, "total_steps": 3215, "loss": 2.0935, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.751664519422778e-05, "epoch": 0.14, "percentage": 14.31, "elapsed_time": "0:04:05", "remaining_time": "0:24:31"}
+{"current_steps": 465, "total_steps": 3215, "loss": 2.1142, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.746330273329386e-05, "epoch": 0.14, "percentage": 14.46, "elapsed_time": "0:04:08", "remaining_time": "0:24:29"}
+{"current_steps": 470, "total_steps": 3215, "loss": 2.1289, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.740942404305356e-05, "epoch": 0.15, "percentage": 14.62, "elapsed_time": "0:04:11", "remaining_time": "0:24:26"}
+{"current_steps": 475, "total_steps": 3215, "loss": 1.9741, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.735501040966383e-05, "epoch": 0.15, "percentage": 14.77, "elapsed_time": "0:04:13", "remaining_time": "0:24:24"}
+{"current_steps": 480, "total_steps": 3215, "loss": 2.088, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.730006313205143e-05, "epoch": 0.15, "percentage": 14.93, "elapsed_time": "0:04:16", "remaining_time": "0:24:22"}
+{"current_steps": 485, "total_steps": 3215, "loss": 2.2079, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.724458352188192e-05, "epoch": 0.15, "percentage": 15.09, "elapsed_time": "0:04:19", "remaining_time": "0:24:19"}
+{"current_steps": 490, "total_steps": 3215, "loss": 2.048, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.718857290352835e-05, "epoch": 0.15, "percentage": 15.24, "elapsed_time": "0:04:21", "remaining_time": "0:24:14"}
+{"current_steps": 495, "total_steps": 3215, "loss": 2.2569, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.713203261403966e-05, "epoch": 0.15, "percentage": 15.4, "elapsed_time": "0:04:24", "remaining_time": "0:24:11"}
+{"current_steps": 500, "total_steps": 3215, "loss": 1.9574, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.707496400310874e-05, "epoch": 0.16, "percentage": 15.55, "elapsed_time": "0:04:26", "remaining_time": "0:24:09"}
+{"current_steps": 505, "total_steps": 3215, "loss": 2.0951, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.701736843304025e-05, "epoch": 0.16, "percentage": 15.71, "elapsed_time": "0:04:29", "remaining_time": "0:24:08"}
+{"current_steps": 510, "total_steps": 3215, "loss": 2.0253, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.695924727871805e-05, "epoch": 0.16, "percentage": 15.86, "elapsed_time": "0:04:32", "remaining_time": "0:24:05"}
+{"current_steps": 515, "total_steps": 3215, "loss": 2.0602, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.690060192757242e-05, "epoch": 0.16, "percentage": 16.02, "elapsed_time": "0:04:35", "remaining_time": "0:24:02"}
+{"current_steps": 520, "total_steps": 3215, "loss": 2.0386, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.684143377954691e-05, "epoch": 0.16, "percentage": 16.17, "elapsed_time": "0:04:37", "remaining_time": "0:23:57"}
+{"current_steps": 525, "total_steps": 3215, "loss": 2.073, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6781744247064955e-05, "epoch": 0.16, "percentage": 16.33, "elapsed_time": "0:04:39", "remaining_time": "0:23:54"}
+{"current_steps": 530, "total_steps": 3215, "loss": 2.1443, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6721534754996125e-05, "epoch": 0.16, "percentage": 16.49, "elapsed_time": "0:04:42", "remaining_time": "0:23:51"}
+{"current_steps": 535, "total_steps": 3215, "loss": 2.0288, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.666080674062213e-05, "epoch": 0.17, "percentage": 16.64, "elapsed_time": "0:04:45", "remaining_time": "0:23:48"}
+{"current_steps": 540, "total_steps": 3215, "loss": 2.0609, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.659956165360251e-05, "epoch": 0.17, "percentage": 16.8, "elapsed_time": "0:04:47", "remaining_time": "0:23:44"}
+{"current_steps": 545, "total_steps": 3215, "loss": 1.9539, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6537800955940005e-05, "epoch": 0.17, "percentage": 16.95, "elapsed_time": "0:04:49", "remaining_time": "0:23:40"}
+{"current_steps": 550, "total_steps": 3215, "loss": 2.149, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.647552612194572e-05, "epoch": 0.17, "percentage": 17.11, "elapsed_time": "0:04:52", "remaining_time": "0:23:37"}
+{"current_steps": 555, "total_steps": 3215, "loss": 1.9722, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.641273863820383e-05, "epoch": 0.17, "percentage": 17.26, "elapsed_time": "0:04:55", "remaining_time": "0:23:35"}
+{"current_steps": 560, "total_steps": 3215, "loss": 2.0729, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.634944000353622e-05, "epoch": 0.17, "percentage": 17.42, "elapsed_time": "0:04:57", "remaining_time": "0:23:31"}
+{"current_steps": 565, "total_steps": 3215, "loss": 1.9507, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.628563172896655e-05, "epoch": 0.18, "percentage": 17.57, "elapsed_time": "0:05:00", "remaining_time": "0:23:29"}
+{"current_steps": 570, "total_steps": 3215, "loss": 2.1643, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6221315337684353e-05, "epoch": 0.18, "percentage": 17.73, "elapsed_time": "0:05:03", "remaining_time": "0:23:26"}
+{"current_steps": 575, "total_steps": 3215, "loss": 2.1839, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.615649236500854e-05, "epoch": 0.18, "percentage": 17.88, "elapsed_time": "0:05:05", "remaining_time": "0:23:22"}
+{"current_steps": 580, "total_steps": 3215, "loss": 2.0976, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.609116435835083e-05, "epoch": 0.18, "percentage": 18.04, "elapsed_time": "0:05:08", "remaining_time": "0:23:20"}
+{"current_steps": 585, "total_steps": 3215, "loss": 2.1474, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.602533287717877e-05, "epoch": 0.18, "percentage": 18.2, "elapsed_time": "0:05:10", "remaining_time": "0:23:17"}
+{"current_steps": 590, "total_steps": 3215, "loss": 2.1873, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5958999492978524e-05, "epoch": 0.18, "percentage": 18.35, "elapsed_time": "0:05:13", "remaining_time": "0:23:14"}
+{"current_steps": 595, "total_steps": 3215, "loss": 2.1744, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.589216578921737e-05, "epoch": 0.19, "percentage": 18.51, "elapsed_time": "0:05:16", "remaining_time": "0:23:11"}
+{"current_steps": 600, "total_steps": 3215, "loss": 1.9982, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.582483336130586e-05, "epoch": 0.19, "percentage": 18.66, "elapsed_time": "0:05:18", "remaining_time": "0:23:09"}
+{"current_steps": 605, "total_steps": 3215, "loss": 2.1234, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.575700381655979e-05, "epoch": 0.19, "percentage": 18.82, "elapsed_time": "0:05:22", "remaining_time": "0:23:09"}
+{"current_steps": 610, "total_steps": 3215, "loss": 1.9478, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5688678774161796e-05, "epoch": 0.19, "percentage": 18.97, "elapsed_time": "0:05:24", "remaining_time": "0:23:07"}
+{"current_steps": 615, "total_steps": 3215, "loss": 1.8268, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.561985986512271e-05, "epoch": 0.19, "percentage": 19.13, "elapsed_time": "0:05:27", "remaining_time": "0:23:04"}
+{"current_steps": 620, "total_steps": 3215, "loss": 1.9887, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.555054873224263e-05, "epoch": 0.19, "percentage": 19.28, "elapsed_time": "0:05:29", "remaining_time": "0:23:00"}
+{"current_steps": 625, "total_steps": 3215, "loss": 2.0777, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.54807470300717e-05, "epoch": 0.19, "percentage": 19.44, "elapsed_time": "0:05:32", "remaining_time": "0:22:59"}
+{"current_steps": 630, "total_steps": 3215, "loss": 2.0566, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5410456424870596e-05, "epoch": 0.2, "percentage": 19.6, "elapsed_time": "0:05:35", "remaining_time": "0:22:56"}
+{"current_steps": 635, "total_steps": 3215, "loss": 2.047, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5339678594570795e-05, "epoch": 0.2, "percentage": 19.75, "elapsed_time": "0:05:38", "remaining_time": "0:22:53"}
+{"current_steps": 640, "total_steps": 3215, "loss": 1.962, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.526841522873449e-05, "epoch": 0.2, "percentage": 19.91, "elapsed_time": "0:05:40", "remaining_time": "0:22:50"}
+{"current_steps": 645, "total_steps": 3215, "loss": 2.0972, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.519666802851422e-05, "epoch": 0.2, "percentage": 20.06, "elapsed_time": "0:05:43", "remaining_time": "0:22:47"}
+{"current_steps": 650, "total_steps": 3215, "loss": 2.0041, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5124438706612376e-05, "epoch": 0.2, "percentage": 20.22, "elapsed_time": "0:05:45", "remaining_time": "0:22:43"}
+{"current_steps": 655, "total_steps": 3215, "loss": 2.1229, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.505172898724018e-05, "epoch": 0.2, "percentage": 20.37, "elapsed_time": "0:05:48", "remaining_time": "0:22:41"}
+{"current_steps": 660, "total_steps": 3215, "loss": 2.0195, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.497854060607662e-05, "epoch": 0.21, "percentage": 20.53, "elapsed_time": "0:05:50", "remaining_time": "0:22:37"}
+{"current_steps": 665, "total_steps": 3215, "loss": 2.0745, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.490487531022699e-05, "epoch": 0.21, "percentage": 20.68, "elapsed_time": "0:05:53", "remaining_time": "0:22:34"}
+{"current_steps": 670, "total_steps": 3215, "loss": 2.1068, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4830734858181145e-05, "epoch": 0.21, "percentage": 20.84, "elapsed_time": "0:05:55", "remaining_time": "0:22:31"}
+{"current_steps": 675, "total_steps": 3215, "loss": 1.8088, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.47561210197716e-05, "epoch": 0.21, "percentage": 21.0, "elapsed_time": "0:05:58", "remaining_time": "0:22:28"}
+{"current_steps": 680, "total_steps": 3215, "loss": 2.0995, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4681035576131215e-05, "epoch": 0.21, "percentage": 21.15, "elapsed_time": "0:06:01", "remaining_time": "0:22:25"}
+{"current_steps": 685, "total_steps": 3215, "loss": 2.0541, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.46054803196507e-05, "epoch": 0.21, "percentage": 21.31, "elapsed_time": "0:06:03", "remaining_time": "0:22:23"}
+{"current_steps": 690, "total_steps": 3215, "loss": 2.166, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.452945705393586e-05, "epoch": 0.21, "percentage": 21.46, "elapsed_time": "0:06:05", "remaining_time": "0:22:18"}
+{"current_steps": 695, "total_steps": 3215, "loss": 2.0784, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.445296759376449e-05, "epoch": 0.22, "percentage": 21.62, "elapsed_time": "0:06:07", "remaining_time": "0:22:13"}
+{"current_steps": 700, "total_steps": 3215, "loss": 2.2087, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.437601376504307e-05, "epoch": 0.22, "percentage": 21.77, "elapsed_time": "0:06:10", "remaining_time": "0:22:11"}
+{"current_steps": 705, "total_steps": 3215, "loss": 2.1199, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4298597404763186e-05, "epoch": 0.22, "percentage": 21.93, "elapsed_time": "0:06:14", "remaining_time": "0:22:11"}
+{"current_steps": 710, "total_steps": 3215, "loss": 2.0355, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.422072036095768e-05, "epoch": 0.22, "percentage": 22.08, "elapsed_time": "0:06:16", "remaining_time": "0:22:10"}
+{"current_steps": 715, "total_steps": 3215, "loss": 2.0011, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.414238449265654e-05, "epoch": 0.22, "percentage": 22.24, "elapsed_time": "0:06:19", "remaining_time": "0:22:07"}
+{"current_steps": 720, "total_steps": 3215, "loss": 2.0368, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.406359166984249e-05, "epoch": 0.22, "percentage": 22.4, "elapsed_time": "0:06:22", "remaining_time": "0:22:03"}
+{"current_steps": 725, "total_steps": 3215, "loss": 1.9983, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.39843437734064e-05, "epoch": 0.23, "percentage": 22.55, "elapsed_time": "0:06:24", "remaining_time": "0:22:02"}
+{"current_steps": 730, "total_steps": 3215, "loss": 2.021, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.390464269510233e-05, "epoch": 0.23, "percentage": 22.71, "elapsed_time": "0:06:27", "remaining_time": "0:22:00"}
+{"current_steps": 735, "total_steps": 3215, "loss": 1.9743, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.382449033750244e-05, "epoch": 0.23, "percentage": 22.86, "elapsed_time": "0:06:30", "remaining_time": "0:21:56"}
+{"current_steps": 740, "total_steps": 3215, "loss": 2.0689, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.37438886139515e-05, "epoch": 0.23, "percentage": 23.02, "elapsed_time": "0:06:32", "remaining_time": "0:21:53"}
+{"current_steps": 745, "total_steps": 3215, "loss": 2.0838, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3662839448521264e-05, "epoch": 0.23, "percentage": 23.17, "elapsed_time": "0:06:35", "remaining_time": "0:21:50"}
+{"current_steps": 750, "total_steps": 3215, "loss": 2.0835, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.358134477596454e-05, "epoch": 0.23, "percentage": 23.33, "elapsed_time": "0:06:37", "remaining_time": "0:21:47"}
+{"current_steps": 755, "total_steps": 3215, "loss": 2.0916, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3499406541668966e-05, "epoch": 0.23, "percentage": 23.48, "elapsed_time": "0:06:40", "remaining_time": "0:21:43"}
+{"current_steps": 760, "total_steps": 3215, "loss": 1.972, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3417026701610616e-05, "epoch": 0.24, "percentage": 23.64, "elapsed_time": "0:06:43", "remaining_time": "0:21:41"}
+{"current_steps": 765, "total_steps": 3215, "loss": 1.927, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3334207222307275e-05, "epoch": 0.24, "percentage": 23.79, "elapsed_time": "0:06:45", "remaining_time": "0:21:39"}
+{"current_steps": 770, "total_steps": 3215, "loss": 2.1192, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.325095008077154e-05, "epoch": 0.24, "percentage": 23.95, "elapsed_time": "0:06:48", "remaining_time": "0:21:35"}
+{"current_steps": 775, "total_steps": 3215, "loss": 2.0774, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.316725726446353e-05, "epoch": 0.24, "percentage": 24.11, "elapsed_time": "0:06:50", "remaining_time": "0:21:32"}
+{"current_steps": 780, "total_steps": 3215, "loss": 2.0847, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3083130771243586e-05, "epoch": 0.24, "percentage": 24.26, "elapsed_time": "0:06:52", "remaining_time": "0:21:29"}
+{"current_steps": 785, "total_steps": 3215, "loss": 2.0485, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.299857260932445e-05, "epoch": 0.24, "percentage": 24.42, "elapsed_time": "0:06:55", "remaining_time": "0:21:26"}
+{"current_steps": 790, "total_steps": 3215, "loss": 2.1008, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2913584797223397e-05, "epoch": 0.25, "percentage": 24.57, "elapsed_time": "0:06:58", "remaining_time": "0:21:23"}
+{"current_steps": 795, "total_steps": 3215, "loss": 1.9209, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2828169363714016e-05, "epoch": 0.25, "percentage": 24.73, "elapsed_time": "0:07:00", "remaining_time": "0:21:19"}
+{"current_steps": 800, "total_steps": 3215, "loss": 1.9722, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.274232834777782e-05, "epoch": 0.25, "percentage": 24.88, "elapsed_time": "0:07:03", "remaining_time": "0:21:17"}
+{"current_steps": 805, "total_steps": 3215, "loss": 1.9176, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2656063798555515e-05, "epoch": 0.25, "percentage": 25.04, "elapsed_time": "0:07:06", "remaining_time": "0:21:16"}
+{"current_steps": 810, "total_steps": 3215, "loss": 1.9929, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.256937777529815e-05, "epoch": 0.25, "percentage": 25.19, "elapsed_time": "0:07:08", "remaining_time": "0:21:12"}
+{"current_steps": 815, "total_steps": 3215, "loss": 2.166, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2482272347317906e-05, "epoch": 0.25, "percentage": 25.35, "elapsed_time": "0:07:11", "remaining_time": "0:21:10"}
+{"current_steps": 820, "total_steps": 3215, "loss": 2.1334, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2394749593938733e-05, "epoch": 0.25, "percentage": 25.51, "elapsed_time": "0:07:13", "remaining_time": "0:21:07"}
+{"current_steps": 825, "total_steps": 3215, "loss": 2.0853, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.230681160444669e-05, "epoch": 0.26, "percentage": 25.66, "elapsed_time": "0:07:16", "remaining_time": "0:21:04"}
+{"current_steps": 830, "total_steps": 3215, "loss": 2.1802, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.221846047804009e-05, "epoch": 0.26, "percentage": 25.82, "elapsed_time": "0:07:18", "remaining_time": "0:21:01"}
+{"current_steps": 835, "total_steps": 3215, "loss": 2.0739, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2129698323779366e-05, "epoch": 0.26, "percentage": 25.97, "elapsed_time": "0:07:21", "remaining_time": "0:20:58"}
+{"current_steps": 840, "total_steps": 3215, "loss": 2.0238, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.204052726053676e-05, "epoch": 0.26, "percentage": 26.13, "elapsed_time": "0:07:23", "remaining_time": "0:20:54"}
+{"current_steps": 845, "total_steps": 3215, "loss": 2.1557, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.195094941694571e-05, "epoch": 0.26, "percentage": 26.28, "elapsed_time": "0:07:26", "remaining_time": "0:20:51"}
+{"current_steps": 850, "total_steps": 3215, "loss": 2.1666, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1860966931350054e-05, "epoch": 0.26, "percentage": 26.44, "elapsed_time": "0:07:28", "remaining_time": "0:20:48"}
+{"current_steps": 855, "total_steps": 3215, "loss": 2.105, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1770581951752976e-05, "epoch": 0.27, "percentage": 26.59, "elapsed_time": "0:07:31", "remaining_time": "0:20:45"}
+{"current_steps": 860, "total_steps": 3215, "loss": 1.9656, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1679796635765735e-05, "epoch": 0.27, "percentage": 26.75, "elapsed_time": "0:07:33", "remaining_time": "0:20:42"}
+{"current_steps": 865, "total_steps": 3215, "loss": 2.0166, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.158861315055617e-05, "epoch": 0.27, "percentage": 26.91, "elapsed_time": "0:07:36", "remaining_time": "0:20:39"}
+{"current_steps": 870, "total_steps": 3215, "loss": 2.0076, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1497033672796924e-05, "epoch": 0.27, "percentage": 27.06, "elapsed_time": "0:07:38", "remaining_time": "0:20:36"}
+{"current_steps": 875, "total_steps": 3215, "loss": 2.1594, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.140506038861356e-05, "epoch": 0.27, "percentage": 27.22, "elapsed_time": "0:07:41", "remaining_time": "0:20:33"}
+{"current_steps": 880, "total_steps": 3215, "loss": 2.1416, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.131269549353229e-05, "epoch": 0.27, "percentage": 27.37, "elapsed_time": "0:07:43", "remaining_time": "0:20:30"}
+{"current_steps": 885, "total_steps": 3215, "loss": 2.1242, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1219941192427644e-05, "epoch": 0.28, "percentage": 27.53, "elapsed_time": "0:07:46", "remaining_time": "0:20:27"}
+{"current_steps": 890, "total_steps": 3215, "loss": 2.02, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.112679969946977e-05, "epoch": 0.28, "percentage": 27.68, "elapsed_time": "0:07:48", "remaining_time": "0:20:24"}
+{"current_steps": 895, "total_steps": 3215, "loss": 2.0438, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.103327323807162e-05, "epoch": 0.28, "percentage": 27.84, "elapsed_time": "0:07:51", "remaining_time": "0:20:21"}
+{"current_steps": 900, "total_steps": 3215, "loss": 1.9806, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.093936404083585e-05, "epoch": 0.28, "percentage": 27.99, "elapsed_time": "0:07:53", "remaining_time": "0:20:18"}
+{"current_steps": 905, "total_steps": 3215, "loss": 2.1476, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.0845074349501544e-05, "epoch": 0.28, "percentage": 28.15, "elapsed_time": "0:07:56", "remaining_time": "0:20:16"}
+{"current_steps": 910, "total_steps": 3215, "loss": 1.9672, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.0750406414890695e-05, "epoch": 0.28, "percentage": 28.3, "elapsed_time": "0:07:59", "remaining_time": "0:20:13"}
+{"current_steps": 915, "total_steps": 3215, "loss": 1.9984, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.065536249685448e-05, "epoch": 0.28, "percentage": 28.46, "elapsed_time": "0:08:01", "remaining_time": "0:20:11"}
+{"current_steps": 920, "total_steps": 3215, "loss": 2.1162, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.055994486421929e-05, "epoch": 0.29, "percentage": 28.62, "elapsed_time": "0:08:04", "remaining_time": "0:20:08"}
+{"current_steps": 925, "total_steps": 3215, "loss": 2.0435, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.04641557947326e-05, "epoch": 0.29, "percentage": 28.77, "elapsed_time": "0:08:06", "remaining_time": "0:20:04"}
+{"current_steps": 930, "total_steps": 3215, "loss": 2.0431, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.036799757500856e-05, "epoch": 0.29, "percentage": 28.93, "elapsed_time": "0:08:09", "remaining_time": "0:20:01"}
+{"current_steps": 935, "total_steps": 3215, "loss": 2.2021, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.027147250047348e-05, "epoch": 0.29, "percentage": 29.08, "elapsed_time": "0:08:11", "remaining_time": "0:19:59"}
+{"current_steps": 940, "total_steps": 3215, "loss": 1.997, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.017458287531094e-05, "epoch": 0.29, "percentage": 29.24, "elapsed_time": "0:08:14", "remaining_time": "0:19:56"}
+{"current_steps": 945, "total_steps": 3215, "loss": 1.946, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.007733101240685e-05, "epoch": 0.29, "percentage": 29.39, "elapsed_time": "0:08:16", "remaining_time": "0:19:53"}
+{"current_steps": 950, "total_steps": 3215, "loss": 2.0723, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.997971923329426e-05, "epoch": 0.3, "percentage": 29.55, "elapsed_time": "0:08:19", "remaining_time": "0:19:49"}
+{"current_steps": 955, "total_steps": 3215, "loss": 2.034, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.988174986809783e-05, "epoch": 0.3, "percentage": 29.7, "elapsed_time": "0:08:21", "remaining_time": "0:19:47"}
+{"current_steps": 960, "total_steps": 3215, "loss": 1.9736, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.9783425255478355e-05, "epoch": 0.3, "percentage": 29.86, "elapsed_time": "0:08:24", "remaining_time": "0:19:44"}
+{"current_steps": 965, "total_steps": 3215, "loss": 1.9878, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.968474774257682e-05, "epoch": 0.3, "percentage": 30.02, "elapsed_time": "0:08:26", "remaining_time": "0:19:41"}
+{"current_steps": 970, "total_steps": 3215, "loss": 2.117, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.9585719684958446e-05, "epoch": 0.3, "percentage": 30.17, "elapsed_time": "0:08:29", "remaining_time": "0:19:38"}
+{"current_steps": 975, "total_steps": 3215, "loss": 2.0585, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.948634344655639e-05, "epoch": 0.3, "percentage": 30.33, "elapsed_time": "0:08:31", "remaining_time": "0:19:35"}
+{"current_steps": 980, "total_steps": 3215, "loss": 2.0409, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.938662139961538e-05, "epoch": 0.3, "percentage": 30.48, "elapsed_time": "0:08:33", "remaining_time": "0:19:32"}
+{"current_steps": 985, "total_steps": 3215, "loss": 2.0369, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.928655592463508e-05, "epoch": 0.31, "percentage": 30.64, "elapsed_time": "0:08:36", "remaining_time": "0:19:29"}
+{"current_steps": 990, "total_steps": 3215, "loss": 1.967, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.918614941031319e-05, "epoch": 0.31, "percentage": 30.79, "elapsed_time": "0:08:38", "remaining_time": "0:19:26"}
+{"current_steps": 995, "total_steps": 3215, "loss": 2.0037, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.908540425348852e-05, "epoch": 0.31, "percentage": 30.95, "elapsed_time": "0:08:41", "remaining_time": "0:19:23"}
+{"current_steps": 1000, "total_steps": 3215, "loss": 1.9991, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8984322859083725e-05, "epoch": 0.31, "percentage": 31.1, "elapsed_time": "0:08:44", "remaining_time": "0:19:20"}
+{"current_steps": 1005, "total_steps": 3215, "loss": 2.0448, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8882907640047896e-05, "epoch": 0.31, "percentage": 31.26, "elapsed_time": "0:08:47", "remaining_time": "0:19:19"}
+{"current_steps": 1010, "total_steps": 3215, "loss": 2.0791, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.878116101729897e-05, "epoch": 0.31, "percentage": 31.42, "elapsed_time": "0:08:49", "remaining_time": "0:19:16"}
+{"current_steps": 1015, "total_steps": 3215, "loss": 1.9997, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.867908541966594e-05, "epoch": 0.32, "percentage": 31.57, "elapsed_time": "0:08:51", "remaining_time": "0:19:13"}
+{"current_steps": 1020, "total_steps": 3215, "loss": 2.0481, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.857668328383088e-05, "epoch": 0.32, "percentage": 31.73, "elapsed_time": "0:08:54", "remaining_time": "0:19:10"}
+{"current_steps": 1025, "total_steps": 3215, "loss": 2.2664, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.847395705427075e-05, "epoch": 0.32, "percentage": 31.88, "elapsed_time": "0:08:57", "remaining_time": "0:19:07"}
+{"current_steps": 1030, "total_steps": 3215, "loss": 1.9752, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.837090918319909e-05, "epoch": 0.32, "percentage": 32.04, "elapsed_time": "0:08:59", "remaining_time": "0:19:04"}
+{"current_steps": 1035, "total_steps": 3215, "loss": 2.1332, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8267542130507436e-05, "epoch": 0.32, "percentage": 32.19, "elapsed_time": "0:09:02", "remaining_time": "0:19:02"}
+{"current_steps": 1040, "total_steps": 3215, "loss": 2.0432, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.816385836370663e-05, "epoch": 0.32, "percentage": 32.35, "elapsed_time": "0:09:04", "remaining_time": "0:18:59"}
+{"current_steps": 1045, "total_steps": 3215, "loss": 1.9618, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.805986035786789e-05, "epoch": 0.32, "percentage": 32.5, "elapsed_time": "0:09:07", "remaining_time": "0:18:56"}
+{"current_steps": 1050, "total_steps": 3215, "loss": 2.0267, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.795555059556378e-05, "epoch": 0.33, "percentage": 32.66, "elapsed_time": "0:09:09", "remaining_time": "0:18:53"}
+{"current_steps": 1055, "total_steps": 3215, "loss": 2.1075, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7850931566808866e-05, "epoch": 0.33, "percentage": 32.81, "elapsed_time": "0:09:12", "remaining_time": "0:18:51"}
+{"current_steps": 1060, "total_steps": 3215, "loss": 2.156, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7746005769000363e-05, "epoch": 0.33, "percentage": 32.97, "elapsed_time": "0:09:14", "remaining_time": "0:18:48"}
+{"current_steps": 1065, "total_steps": 3215, "loss": 1.9615, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.764077570685844e-05, "epoch": 0.33, "percentage": 33.13, "elapsed_time": "0:09:17", "remaining_time": "0:18:45"}
+{"current_steps": 1070, "total_steps": 3215, "loss": 2.0928, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.753524389236648e-05, "epoch": 0.33, "percentage": 33.28, "elapsed_time": "0:09:20", "remaining_time": "0:18:42"}
+{"current_steps": 1075, "total_steps": 3215, "loss": 2.1074, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.742941284471111e-05, "epoch": 0.33, "percentage": 33.44, "elapsed_time": "0:09:22", "remaining_time": "0:18:39"}
+{"current_steps": 1080, "total_steps": 3215, "loss": 1.9666, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7323285090222054e-05, "epoch": 0.34, "percentage": 33.59, "elapsed_time": "0:09:25", "remaining_time": "0:18:37"}
+{"current_steps": 1085, "total_steps": 3215, "loss": 2.0468, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.721686316231181e-05, "epoch": 0.34, "percentage": 33.75, "elapsed_time": "0:09:28", "remaining_time": "0:18:35"}
+{"current_steps": 1090, "total_steps": 3215, "loss": 2.0624, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7110149601415215e-05, "epoch": 0.34, "percentage": 33.9, "elapsed_time": "0:09:30", "remaining_time": "0:18:32"}
+{"current_steps": 1095, "total_steps": 3215, "loss": 1.9888, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.700314695492876e-05, "epoch": 0.34, "percentage": 34.06, "elapsed_time": "0:09:33", "remaining_time": "0:18:29"}
+{"current_steps": 1100, "total_steps": 3215, "loss": 2.1013, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6895857777149825e-05, "epoch": 0.34, "percentage": 34.21, "elapsed_time": "0:09:35", "remaining_time": "0:18:27"}
+{"current_steps": 1105, "total_steps": 3215, "loss": 1.875, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6788284629215624e-05, "epoch": 0.34, "percentage": 34.37, "elapsed_time": "0:09:38", "remaining_time": "0:18:25"}
+{"current_steps": 1110, "total_steps": 3215, "loss": 1.9096, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.668043007904219e-05, "epoch": 0.35, "percentage": 34.53, "elapsed_time": "0:09:41", "remaining_time": "0:18:22"}
+{"current_steps": 1115, "total_steps": 3215, "loss": 2.1859, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6572296701262966e-05, "epoch": 0.35, "percentage": 34.68, "elapsed_time": "0:09:44", "remaining_time": "0:18:19"}
+{"current_steps": 1120, "total_steps": 3215, "loss": 2.2092, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.646388707716738e-05, "epoch": 0.35, "percentage": 34.84, "elapsed_time": "0:09:46", "remaining_time": "0:18:17"}
+{"current_steps": 1125, "total_steps": 3215, "loss": 2.0026, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.635520379463926e-05, "epoch": 0.35, "percentage": 34.99, "elapsed_time": "0:09:49", "remaining_time": "0:18:14"}
+{"current_steps": 1130, "total_steps": 3215, "loss": 2.2112, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6246249448095004e-05, "epoch": 0.35, "percentage": 35.15, "elapsed_time": "0:09:51", "remaining_time": "0:18:11"}
+{"current_steps": 1135, "total_steps": 3215, "loss": 2.0221, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6137026638421696e-05, "epoch": 0.35, "percentage": 35.3, "elapsed_time": "0:09:54", "remaining_time": "0:18:09"}
+{"current_steps": 1140, "total_steps": 3215, "loss": 1.9106, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6027537972914974e-05, "epoch": 0.35, "percentage": 35.46, "elapsed_time": "0:09:57", "remaining_time": "0:18:06"}
+{"current_steps": 1145, "total_steps": 3215, "loss": 2.0673, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5917786065216826e-05, "epoch": 0.36, "percentage": 35.61, "elapsed_time": "0:09:59", "remaining_time": "0:18:03"}
+{"current_steps": 1150, "total_steps": 3215, "loss": 2.1463, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.580777353525318e-05, "epoch": 0.36, "percentage": 35.77, "elapsed_time": "0:10:01", "remaining_time": "0:18:00"}
+{"current_steps": 1155, "total_steps": 3215, "loss": 2.0255, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5697503009171385e-05, "epoch": 0.36, "percentage": 35.93, "elapsed_time": "0:10:04", "remaining_time": "0:17:58"}
+{"current_steps": 1160, "total_steps": 3215, "loss": 2.1348, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.558697711927748e-05, "epoch": 0.36, "percentage": 36.08, "elapsed_time": "0:10:07", "remaining_time": "0:17:55"}
+{"current_steps": 1165, "total_steps": 3215, "loss": 2.1457, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.54761985039734e-05, "epoch": 0.36, "percentage": 36.24, "elapsed_time": "0:10:09", "remaining_time": "0:17:52"}
+{"current_steps": 1170, "total_steps": 3215, "loss": 2.1256, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5365169807693966e-05, "epoch": 0.36, "percentage": 36.39, "elapsed_time": "0:10:12", "remaining_time": "0:17:50"}
+{"current_steps": 1175, "total_steps": 3215, "loss": 1.9587, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.525389368084379e-05, "epoch": 0.37, "percentage": 36.55, "elapsed_time": "0:10:14", "remaining_time": "0:17:47"}
+{"current_steps": 1180, "total_steps": 3215, "loss": 1.8965, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.514237277973393e-05, "epoch": 0.37, "percentage": 36.7, "elapsed_time": "0:10:17", "remaining_time": "0:17:44"}
+{"current_steps": 1185, "total_steps": 3215, "loss": 1.9669, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.503060976651862e-05, "epoch": 0.37, "percentage": 36.86, "elapsed_time": "0:10:20", "remaining_time": "0:17:42"}
+{"current_steps": 1190, "total_steps": 3215, "loss": 2.003, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.491860730913156e-05, "epoch": 0.37, "percentage": 37.01, "elapsed_time": "0:10:22", "remaining_time": "0:17:39"}
+{"current_steps": 1195, "total_steps": 3215, "loss": 2.1487, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.480636808122235e-05, "epoch": 0.37, "percentage": 37.17, "elapsed_time": "0:10:25", "remaining_time": "0:17:36"}
+{"current_steps": 1200, "total_steps": 3215, "loss": 2.0686, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.469389476209259e-05, "epoch": 0.37, "percentage": 37.33, "elapsed_time": "0:10:27", "remaining_time": "0:17:33"}
+{"current_steps": 1205, "total_steps": 3215, "loss": 2.0284, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.458119003663199e-05, "epoch": 0.37, "percentage": 37.48, "elapsed_time": "0:10:30", "remaining_time": "0:17:32"}
+{"current_steps": 1210, "total_steps": 3215, "loss": 2.0555, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.446825659525421e-05, "epoch": 0.38, "percentage": 37.64, "elapsed_time": "0:10:33", "remaining_time": "0:17:30"}
+{"current_steps": 1215, "total_steps": 3215, "loss": 1.9375, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.435509713383268e-05, "epoch": 0.38, "percentage": 37.79, "elapsed_time": "0:10:36", "remaining_time": "0:17:27"}
+{"current_steps": 1220, "total_steps": 3215, "loss": 2.0271, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.424171435363623e-05, "epoch": 0.38, "percentage": 37.95, "elapsed_time": "0:10:38", "remaining_time": "0:17:24"}
+{"current_steps": 1225, "total_steps": 3215, "loss": 2.1897, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.412811096126461e-05, "epoch": 0.38, "percentage": 38.1, "elapsed_time": "0:10:41", "remaining_time": "0:17:21"}
+{"current_steps": 1230, "total_steps": 3215, "loss": 1.9978, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.401428966858387e-05, "epoch": 0.38, "percentage": 38.26, "elapsed_time": "0:10:44", "remaining_time": "0:17:19"}
+{"current_steps": 1235, "total_steps": 3215, "loss": 2.0688, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.390025319266167e-05, "epoch": 0.38, "percentage": 38.41, "elapsed_time": "0:10:46", "remaining_time": "0:17:16"}
+{"current_steps": 1240, "total_steps": 3215, "loss": 2.0396, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3786004255702336e-05, "epoch": 0.39, "percentage": 38.57, "elapsed_time": "0:10:49", "remaining_time": "0:17:13"}
+{"current_steps": 1245, "total_steps": 3215, "loss": 1.9566, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3671545584981954e-05, "epoch": 0.39, "percentage": 38.72, "elapsed_time": "0:10:51", "remaining_time": "0:17:10"}
+{"current_steps": 1250, "total_steps": 3215, "loss": 2.0474, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.355687991278324e-05, "epoch": 0.39, "percentage": 38.88, "elapsed_time": "0:10:54", "remaining_time": "0:17:08"}
+{"current_steps": 1255, "total_steps": 3215, "loss": 2.2163, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3442009976330305e-05, "epoch": 0.39, "percentage": 39.04, "elapsed_time": "0:10:56", "remaining_time": "0:17:05"}
+{"current_steps": 1260, "total_steps": 3215, "loss": 2.1088, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.332693851772331e-05, "epoch": 0.39, "percentage": 39.19, "elapsed_time": "0:10:59", "remaining_time": "0:17:02"}
+{"current_steps": 1265, "total_steps": 3215, "loss": 1.8947, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3211668283873035e-05, "epoch": 0.39, "percentage": 39.35, "elapsed_time": "0:11:01", "remaining_time": "0:17:00"}
+{"current_steps": 1270, "total_steps": 3215, "loss": 2.1748, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3096202026435304e-05, "epoch": 0.39, "percentage": 39.5, "elapsed_time": "0:11:04", "remaining_time": "0:16:57"}
+{"current_steps": 1275, "total_steps": 3215, "loss": 1.9218, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.298054250174527e-05, "epoch": 0.4, "percentage": 39.66, "elapsed_time": "0:11:06", "remaining_time": "0:16:54"}
+{"current_steps": 1280, "total_steps": 3215, "loss": 2.2723, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2864692470751654e-05, "epoch": 0.4, "percentage": 39.81, "elapsed_time": "0:11:09", "remaining_time": "0:16:51"}
+{"current_steps": 1285, "total_steps": 3215, "loss": 2.1456, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.27486546989508e-05, "epoch": 0.4, "percentage": 39.97, "elapsed_time": "0:11:11", "remaining_time": "0:16:49"}
+{"current_steps": 1290, "total_steps": 3215, "loss": 1.8877, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.263243195632068e-05, "epoch": 0.4, "percentage": 40.12, "elapsed_time": "0:11:14", "remaining_time": "0:16:46"}
+{"current_steps": 1295, "total_steps": 3215, "loss": 2.0615, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2516027017254785e-05, "epoch": 0.4, "percentage": 40.28, "elapsed_time": "0:11:16", "remaining_time": "0:16:43"}
+{"current_steps": 1300, "total_steps": 3215, "loss": 2.0402, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.239944266049587e-05, "epoch": 0.4, "percentage": 40.44, "elapsed_time": "0:11:19", "remaining_time": "0:16:40"}
+{"current_steps": 1305, "total_steps": 3215, "loss": 2.0728, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.228268166906962e-05, "epoch": 0.41, "percentage": 40.59, "elapsed_time": "0:11:22", "remaining_time": "0:16:39"}
+{"current_steps": 1310, "total_steps": 3215, "loss": 2.1815, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2165746830218254e-05, "epoch": 0.41, "percentage": 40.75, "elapsed_time": "0:11:25", "remaining_time": "0:16:36"}
+{"current_steps": 1315, "total_steps": 3215, "loss": 1.8935, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.204864093533394e-05, "epoch": 0.41, "percentage": 40.9, "elapsed_time": "0:11:27", "remaining_time": "0:16:34"}
+{"current_steps": 1320, "total_steps": 3215, "loss": 1.9567, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.193136677989221e-05, "epoch": 0.41, "percentage": 41.06, "elapsed_time": "0:11:30", "remaining_time": "0:16:31"}
+{"current_steps": 1325, "total_steps": 3215, "loss": 2.055, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.181392716338516e-05, "epoch": 0.41, "percentage": 41.21, "elapsed_time": "0:11:33", "remaining_time": "0:16:28"}
+{"current_steps": 1330, "total_steps": 3215, "loss": 1.8794, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1696324889254716e-05, "epoch": 0.41, "percentage": 41.37, "elapsed_time": "0:11:35", "remaining_time": "0:16:25"}
+{"current_steps": 1335, "total_steps": 3215, "loss": 2.0299, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.15785627648256e-05, "epoch": 0.42, "percentage": 41.52, "elapsed_time": "0:11:38", "remaining_time": "0:16:23"}
+{"current_steps": 1340, "total_steps": 3215, "loss": 1.9342, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.146064360123846e-05, "epoch": 0.42, "percentage": 41.68, "elapsed_time": "0:11:40", "remaining_time": "0:16:20"}
+{"current_steps": 1345, "total_steps": 3215, "loss": 2.0399, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1342570213382594e-05, "epoch": 0.42, "percentage": 41.84, "elapsed_time": "0:11:43", "remaining_time": "0:16:17"}
+{"current_steps": 1350, "total_steps": 3215, "loss": 2.1419, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.122434541982888e-05, "epoch": 0.42, "percentage": 41.99, "elapsed_time": "0:11:45", "remaining_time": "0:16:15"}
+{"current_steps": 1355, "total_steps": 3215, "loss": 2.2932, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.110597204276247e-05, "epoch": 0.42, "percentage": 42.15, "elapsed_time": "0:11:48", "remaining_time": "0:16:12"}
+{"current_steps": 1360, "total_steps": 3215, "loss": 1.8989, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.098745290791539e-05, "epoch": 0.42, "percentage": 42.3, "elapsed_time": "0:11:50", "remaining_time": "0:16:09"}
+{"current_steps": 1365, "total_steps": 3215, "loss": 2.1214, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.086879084449907e-05, "epoch": 0.42, "percentage": 42.46, "elapsed_time": "0:11:53", "remaining_time": "0:16:07"}
+{"current_steps": 1370, "total_steps": 3215, "loss": 2.2538, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.074998868513688e-05, "epoch": 0.43, "percentage": 42.61, "elapsed_time": "0:11:55", "remaining_time": "0:16:04"}
+{"current_steps": 1375, "total_steps": 3215, "loss": 2.0974, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0631049265796465e-05, "epoch": 0.43, "percentage": 42.77, "elapsed_time": "0:11:58", "remaining_time": "0:16:01"}
+{"current_steps": 1380, "total_steps": 3215, "loss": 2.054, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.051197542572203e-05, "epoch": 0.43, "percentage": 42.92, "elapsed_time": "0:12:00", "remaining_time": "0:15:58"}
+{"current_steps": 1385, "total_steps": 3215, "loss": 1.9798, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0392770007366584e-05, "epoch": 0.43, "percentage": 43.08, "elapsed_time": "0:12:03", "remaining_time": "0:15:56"}
+{"current_steps": 1390, "total_steps": 3215, "loss": 2.0796, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0273435856324112e-05, "epoch": 0.43, "percentage": 43.23, "elapsed_time": "0:12:06", "remaining_time": "0:15:53"}
+{"current_steps": 1395, "total_steps": 3215, "loss": 1.9116, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0153975821261605e-05, "epoch": 0.43, "percentage": 43.39, "elapsed_time": "0:12:08", "remaining_time": "0:15:50"}
+{"current_steps": 1400, "total_steps": 3215, "loss": 2.0235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0034392753851066e-05, "epoch": 0.44, "percentage": 43.55, "elapsed_time": "0:12:11", "remaining_time": "0:15:48"}
+{"current_steps": 1405, "total_steps": 3215, "loss": 2.1455, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9914689508701476e-05, "epoch": 0.44, "percentage": 43.7, "elapsed_time": "0:12:14", "remaining_time": "0:15:46"}
+{"current_steps": 1410, "total_steps": 3215, "loss": 2.0355, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.979486894329058e-05, "epoch": 0.44, "percentage": 43.86, "elapsed_time": "0:12:17", "remaining_time": "0:15:43"}
+{"current_steps": 1415, "total_steps": 3215, "loss": 2.0379, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9674933917896747e-05, "epoch": 0.44, "percentage": 44.01, "elapsed_time": "0:12:19", "remaining_time": "0:15:41"}
+{"current_steps": 1420, "total_steps": 3215, "loss": 2.0802, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9554887295530647e-05, "epoch": 0.44, "percentage": 44.17, "elapsed_time": "0:12:22", "remaining_time": "0:15:38"}
+{"current_steps": 1425, "total_steps": 3215, "loss": 2.1044, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.943473194186693e-05, "epoch": 0.44, "percentage": 44.32, "elapsed_time": "0:12:24", "remaining_time": "0:15:35"}
+{"current_steps": 1430, "total_steps": 3215, "loss": 2.0121, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9314470725175792e-05, "epoch": 0.44, "percentage": 44.48, "elapsed_time": "0:12:27", "remaining_time": "0:15:32"}
+{"current_steps": 1435, "total_steps": 3215, "loss": 2.0717, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.919410651625455e-05, "epoch": 0.45, "percentage": 44.63, "elapsed_time": "0:12:30", "remaining_time": "0:15:30"}
+{"current_steps": 1440, "total_steps": 3215, "loss": 1.9522, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.907364218835904e-05, "epoch": 0.45, "percentage": 44.79, "elapsed_time": "0:12:32", "remaining_time": "0:15:27"}
+{"current_steps": 1445, "total_steps": 3215, "loss": 1.9593, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8953080617135115e-05, "epoch": 0.45, "percentage": 44.95, "elapsed_time": "0:12:35", "remaining_time": "0:15:25"}
+{"current_steps": 1450, "total_steps": 3215, "loss": 1.8073, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8832424680549937e-05, "epoch": 0.45, "percentage": 45.1, "elapsed_time": "0:12:37", "remaining_time": "0:15:22"}
+{"current_steps": 1455, "total_steps": 3215, "loss": 2.0042, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8711677258823306e-05, "epoch": 0.45, "percentage": 45.26, "elapsed_time": "0:12:40", "remaining_time": "0:15:19"}
+{"current_steps": 1460, "total_steps": 3215, "loss": 1.9931, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.859084123435887e-05, "epoch": 0.45, "percentage": 45.41, "elapsed_time": "0:12:42", "remaining_time": "0:15:16"}
+{"current_steps": 1465, "total_steps": 3215, "loss": 2.1533, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.84699194916754e-05, "epoch": 0.46, "percentage": 45.57, "elapsed_time": "0:12:45", "remaining_time": "0:15:14"}
+{"current_steps": 1470, "total_steps": 3215, "loss": 2.029, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.834891491733781e-05, "epoch": 0.46, "percentage": 45.72, "elapsed_time": "0:12:48", "remaining_time": "0:15:11"}
+{"current_steps": 1475, "total_steps": 3215, "loss": 2.0241, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.822783039988836e-05, "epoch": 0.46, "percentage": 45.88, "elapsed_time": "0:12:50", "remaining_time": "0:15:08"}
+{"current_steps": 1480, "total_steps": 3215, "loss": 2.0959, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8106668829777645e-05, "epoch": 0.46, "percentage": 46.03, "elapsed_time": "0:12:53", "remaining_time": "0:15:06"}
+{"current_steps": 1485, "total_steps": 3215, "loss": 1.8718, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7985433099295618e-05, "epoch": 0.46, "percentage": 46.19, "elapsed_time": "0:12:55", "remaining_time": "0:15:03"}
+{"current_steps": 1490, "total_steps": 3215, "loss": 2.2397, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7864126102502524e-05, "epoch": 0.46, "percentage": 46.35, "elapsed_time": "0:12:58", "remaining_time": "0:15:01"}
+{"current_steps": 1495, "total_steps": 3215, "loss": 2.1083, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.774275073515985e-05, "epoch": 0.46, "percentage": 46.5, "elapsed_time": "0:13:00", "remaining_time": "0:14:58"}
+{"current_steps": 1500, "total_steps": 3215, "loss": 2.0764, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7621309894661167e-05, "epoch": 0.47, "percentage": 46.66, "elapsed_time": "0:13:03", "remaining_time": "0:14:55"}
+{"current_steps": 1505, "total_steps": 3215, "loss": 2.0955, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7499806479962997e-05, "epoch": 0.47, "percentage": 46.81, "elapsed_time": "0:13:06", "remaining_time": "0:14:53"}
+{"current_steps": 1510, "total_steps": 3215, "loss": 2.0449, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7378243391515558e-05, "epoch": 0.47, "percentage": 46.97, "elapsed_time": "0:13:09", "remaining_time": "0:14:50"}
+{"current_steps": 1515, "total_steps": 3215, "loss": 1.8368, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7256623531193605e-05, "epoch": 0.47, "percentage": 47.12, "elapsed_time": "0:13:11", "remaining_time": "0:14:48"}
+{"current_steps": 1520, "total_steps": 3215, "loss": 2.024, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7134949802227073e-05, "epoch": 0.47, "percentage": 47.28, "elapsed_time": "0:13:14", "remaining_time": "0:14:45"}
+{"current_steps": 1525, "total_steps": 3215, "loss": 2.0699, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7013225109131836e-05, "epoch": 0.47, "percentage": 47.43, "elapsed_time": "0:13:16", "remaining_time": "0:14:42"}
+{"current_steps": 1530, "total_steps": 3215, "loss": 1.953, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.689145235764035e-05, "epoch": 0.48, "percentage": 47.59, "elapsed_time": "0:13:19", "remaining_time": "0:14:40"}
+{"current_steps": 1534, "total_steps": 3215, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.48, "percentage": 47.71, "elapsed_time": "0:13:21", "remaining_time": "0:14:38"}
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..904cb416e190d2053dec9a2ce80c8d85cbfe5c5b
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,2172 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.4770085901970692,
+ "eval_steps": 500,
+ "global_step": 1534,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.47774845361709595,
+ "learning_rate": 4.999970160815579e-05,
+ "loss": 2.0765,
+ "step": 5
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6051416397094727,
+ "learning_rate": 4.999880643974619e-05,
+ "loss": 2.2297,
+ "step": 10
+ },
+ {
+ "epoch": 0.0,
+ "grad_norm": 0.6161717772483826,
+ "learning_rate": 4.9997314516140056e-05,
+ "loss": 2.1103,
+ "step": 15
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.4686434268951416,
+ "learning_rate": 4.999522587295162e-05,
+ "loss": 2.0057,
+ "step": 20
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8412289023399353,
+ "learning_rate": 4.999254056003963e-05,
+ "loss": 2.1778,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.5333625078201294,
+ "learning_rate": 4.99892586415061e-05,
+ "loss": 2.2399,
+ "step": 30
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.821148157119751,
+ "learning_rate": 4.9985380195694856e-05,
+ "loss": 2.3215,
+ "step": 35
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.8403909206390381,
+ "learning_rate": 4.998090531518962e-05,
+ "loss": 1.8295,
+ "step": 40
+ },
+ {
+ "epoch": 0.01,
+ "grad_norm": 0.6633398532867432,
+ "learning_rate": 4.9975834106811834e-05,
+ "loss": 2.0195,
+ "step": 45
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6386868357658386,
+ "learning_rate": 4.997016669161806e-05,
+ "loss": 2.1257,
+ "step": 50
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.7762248516082764,
+ "learning_rate": 4.996390320489715e-05,
+ "loss": 2.057,
+ "step": 55
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.3192856311798096,
+ "learning_rate": 4.9957043796166966e-05,
+ "loss": 2.0753,
+ "step": 60
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.9797518849372864,
+ "learning_rate": 4.994958862917083e-05,
+ "loss": 1.9736,
+ "step": 65
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.000693440437317,
+ "learning_rate": 4.994153788187363e-05,
+ "loss": 2.1572,
+ "step": 70
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 0.6852813959121704,
+ "learning_rate": 4.993289174645757e-05,
+ "loss": 2.1491,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "grad_norm": 1.0075691938400269,
+ "learning_rate": 4.992365042931752e-05,
+ "loss": 1.945,
+ "step": 80
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.1973133087158203,
+ "learning_rate": 4.991381415105619e-05,
+ "loss": 2.0811,
+ "step": 85
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9927239418029785,
+ "learning_rate": 4.990338314647881e-05,
+ "loss": 1.961,
+ "step": 90
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9499759674072266,
+ "learning_rate": 4.98923576645875e-05,
+ "loss": 2.0653,
+ "step": 95
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.7233040928840637,
+ "learning_rate": 4.9880737968575365e-05,
+ "loss": 1.9999,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 1.55235755443573,
+ "learning_rate": 4.986852433582022e-05,
+ "loss": 2.2258,
+ "step": 105
+ },
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.9007890820503235,
+ "learning_rate": 4.985571705787793e-05,
+ "loss": 2.1034,
+ "step": 110
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.6774860620498657,
+ "learning_rate": 4.9842316440475475e-05,
+ "loss": 2.1753,
+ "step": 115
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7676737308502197,
+ "learning_rate": 4.9828322803503665e-05,
+ "loss": 2.1384,
+ "step": 120
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9624544978141785,
+ "learning_rate": 4.981373648100946e-05,
+ "loss": 2.0521,
+ "step": 125
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9315722584724426,
+ "learning_rate": 4.979855782118802e-05,
+ "loss": 1.9256,
+ "step": 130
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.9035864472389221,
+ "learning_rate": 4.978278718637443e-05,
+ "loss": 2.0882,
+ "step": 135
+ },
+ {
+ "epoch": 0.04,
+ "grad_norm": 0.7997236251831055,
+ "learning_rate": 4.9766424953035e-05,
+ "loss": 2.0724,
+ "step": 140
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.0692921876907349,
+ "learning_rate": 4.974947151175826e-05,
+ "loss": 2.1329,
+ "step": 145
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.9506180286407471,
+ "learning_rate": 4.973192726724572e-05,
+ "loss": 2.082,
+ "step": 150
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.8647387027740479,
+ "learning_rate": 4.9713792638302145e-05,
+ "loss": 2.0366,
+ "step": 155
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 1.105302095413208,
+ "learning_rate": 4.969506805782555e-05,
+ "loss": 2.1481,
+ "step": 160
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7593303918838501,
+ "learning_rate": 4.967575397279689e-05,
+ "loss": 2.032,
+ "step": 165
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.7521979808807373,
+ "learning_rate": 4.965585084426943e-05,
+ "loss": 2.0379,
+ "step": 170
+ },
+ {
+ "epoch": 0.05,
+ "grad_norm": 0.947120726108551,
+ "learning_rate": 4.9635359147357655e-05,
+ "loss": 2.1444,
+ "step": 175
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2184454202651978,
+ "learning_rate": 4.961427937122598e-05,
+ "loss": 1.9164,
+ "step": 180
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.221663475036621,
+ "learning_rate": 4.959261201907707e-05,
+ "loss": 2.0084,
+ "step": 185
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.0457361936569214,
+ "learning_rate": 4.957035760813982e-05,
+ "loss": 2.2032,
+ "step": 190
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.8834909200668335,
+ "learning_rate": 4.954751666965701e-05,
+ "loss": 2.2101,
+ "step": 195
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.791902482509613,
+ "learning_rate": 4.9524089748872615e-05,
+ "loss": 2.0472,
+ "step": 200
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 1.2905739545822144,
+ "learning_rate": 4.9500077405018807e-05,
+ "loss": 2.0987,
+ "step": 205
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.8612006306648254,
+ "learning_rate": 4.9475480211302583e-05,
+ "loss": 2.1765,
+ "step": 210
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.3128459453582764,
+ "learning_rate": 4.945029875489212e-05,
+ "loss": 1.9926,
+ "step": 215
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.9610918164253235,
+ "learning_rate": 4.94245336369027e-05,
+ "loss": 2.0124,
+ "step": 220
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.873160183429718,
+ "learning_rate": 4.939818547238241e-05,
+ "loss": 2.2229,
+ "step": 225
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.5535285472869873,
+ "learning_rate": 4.9371254890297446e-05,
+ "loss": 2.2013,
+ "step": 230
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 1.1951836347579956,
+ "learning_rate": 4.93437425335171e-05,
+ "loss": 2.014,
+ "step": 235
+ },
+ {
+ "epoch": 0.07,
+ "grad_norm": 0.7874170541763306,
+ "learning_rate": 4.9315649058798384e-05,
+ "loss": 2.1701,
+ "step": 240
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3503323793411255,
+ "learning_rate": 4.928697513677042e-05,
+ "loss": 2.1681,
+ "step": 245
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.3091179132461548,
+ "learning_rate": 4.925772145191834e-05,
+ "loss": 2.1224,
+ "step": 250
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.4428555965423584,
+ "learning_rate": 4.9227888702567044e-05,
+ "loss": 2.0512,
+ "step": 255
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.8234395980834961,
+ "learning_rate": 4.9197477600864446e-05,
+ "loss": 2.1067,
+ "step": 260
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.9094969034194946,
+ "learning_rate": 4.9166488872764526e-05,
+ "loss": 1.8884,
+ "step": 265
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 1.0074087381362915,
+ "learning_rate": 4.913492325800999e-05,
+ "loss": 1.9345,
+ "step": 270
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.0867297649383545,
+ "learning_rate": 4.910278151011458e-05,
+ "loss": 2.1928,
+ "step": 275
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.6842357516288757,
+ "learning_rate": 4.907006439634516e-05,
+ "loss": 2.0407,
+ "step": 280
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8409023284912109,
+ "learning_rate": 4.903677269770329e-05,
+ "loss": 2.2344,
+ "step": 285
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.8119503259658813,
+ "learning_rate": 4.900290720890671e-05,
+ "loss": 2.1296,
+ "step": 290
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9938147068023682,
+ "learning_rate": 4.8968468738370244e-05,
+ "loss": 2.152,
+ "step": 295
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.9865244030952454,
+ "learning_rate": 4.8933458108186606e-05,
+ "loss": 1.9623,
+ "step": 300
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 1.3944802284240723,
+ "learning_rate": 4.889787615410672e-05,
+ "loss": 1.915,
+ "step": 305
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.3749767541885376,
+ "learning_rate": 4.886172372551977e-05,
+ "loss": 1.9934,
+ "step": 310
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.9024938941001892,
+ "learning_rate": 4.882500168543294e-05,
+ "loss": 2.1541,
+ "step": 315
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.1978263854980469,
+ "learning_rate": 4.878771091045082e-05,
+ "loss": 2.1688,
+ "step": 320
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.8360010981559753,
+ "learning_rate": 4.874985229075446e-05,
+ "loss": 2.1387,
+ "step": 325
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 0.7683364152908325,
+ "learning_rate": 4.871142673008012e-05,
+ "loss": 2.0215,
+ "step": 330
+ },
+ {
+ "epoch": 0.1,
+ "grad_norm": 1.4230670928955078,
+ "learning_rate": 4.867243514569772e-05,
+ "loss": 1.9491,
+ "step": 335
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.8198773860931396,
+ "learning_rate": 4.863287846838891e-05,
+ "loss": 2.0151,
+ "step": 340
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.467207908630371,
+ "learning_rate": 4.85927576424249e-05,
+ "loss": 1.8906,
+ "step": 345
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 0.9537095427513123,
+ "learning_rate": 4.855207362554385e-05,
+ "loss": 2.1844,
+ "step": 350
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.0757155418395996,
+ "learning_rate": 4.851082738892809e-05,
+ "loss": 2.048,
+ "step": 355
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.6884938478469849,
+ "learning_rate": 4.8469019917180846e-05,
+ "loss": 1.9537,
+ "step": 360
+ },
+ {
+ "epoch": 0.11,
+ "grad_norm": 1.4680182933807373,
+ "learning_rate": 4.8426652208302814e-05,
+ "loss": 1.9731,
+ "step": 365
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.1778632402420044,
+ "learning_rate": 4.83837252736683e-05,
+ "loss": 2.1395,
+ "step": 370
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.2865056991577148,
+ "learning_rate": 4.834024013800108e-05,
+ "loss": 2.0016,
+ "step": 375
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.055177092552185,
+ "learning_rate": 4.8296197839349944e-05,
+ "loss": 1.9632,
+ "step": 380
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0041871070861816,
+ "learning_rate": 4.825159942906389e-05,
+ "loss": 2.3302,
+ "step": 385
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.0026438236236572,
+ "learning_rate": 4.820644597176709e-05,
+ "loss": 2.1517,
+ "step": 390
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 1.3532180786132812,
+ "learning_rate": 4.81607385453334e-05,
+ "loss": 2.1229,
+ "step": 395
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.7670988440513611,
+ "learning_rate": 4.81144782408607e-05,
+ "loss": 2.1382,
+ "step": 400
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.0405700206756592,
+ "learning_rate": 4.8067666162644774e-05,
+ "loss": 1.9614,
+ "step": 405
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.2252662181854248,
+ "learning_rate": 4.802030342815304e-05,
+ "loss": 2.1399,
+ "step": 410
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.237946629524231,
+ "learning_rate": 4.7972391167997754e-05,
+ "loss": 1.9034,
+ "step": 415
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8064705729484558,
+ "learning_rate": 4.7923930525909156e-05,
+ "loss": 2.0075,
+ "step": 420
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 0.8717565536499023,
+ "learning_rate": 4.7874922658708065e-05,
+ "loss": 2.0105,
+ "step": 425
+ },
+ {
+ "epoch": 0.13,
+ "grad_norm": 1.6693098545074463,
+ "learning_rate": 4.782536873627832e-05,
+ "loss": 2.0242,
+ "step": 430
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.82447350025177,
+ "learning_rate": 4.777526994153882e-05,
+ "loss": 2.0267,
+ "step": 435
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9926588535308838,
+ "learning_rate": 4.7724627470415307e-05,
+ "loss": 1.9119,
+ "step": 440
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.0924450159072876,
+ "learning_rate": 4.7673442531811796e-05,
+ "loss": 2.2653,
+ "step": 445
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1592103242874146,
+ "learning_rate": 4.762171634758177e-05,
+ "loss": 2.0017,
+ "step": 450
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.9172110557556152,
+ "learning_rate": 4.7569450152498927e-05,
+ "loss": 2.1408,
+ "step": 455
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 1.1897525787353516,
+ "learning_rate": 4.751664519422778e-05,
+ "loss": 2.0935,
+ "step": 460
+ },
+ {
+ "epoch": 0.14,
+ "grad_norm": 0.8793094158172607,
+ "learning_rate": 4.746330273329386e-05,
+ "loss": 2.1142,
+ "step": 465
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.4337489604949951,
+ "learning_rate": 4.740942404305356e-05,
+ "loss": 2.1289,
+ "step": 470
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.0251764059066772,
+ "learning_rate": 4.735501040966383e-05,
+ "loss": 1.9741,
+ "step": 475
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.2659822702407837,
+ "learning_rate": 4.730006313205143e-05,
+ "loss": 2.088,
+ "step": 480
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.8884140849113464,
+ "learning_rate": 4.724458352188192e-05,
+ "loss": 2.2079,
+ "step": 485
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 1.1937768459320068,
+ "learning_rate": 4.718857290352835e-05,
+ "loss": 2.048,
+ "step": 490
+ },
+ {
+ "epoch": 0.15,
+ "grad_norm": 0.9741552472114563,
+ "learning_rate": 4.713203261403966e-05,
+ "loss": 2.2569,
+ "step": 495
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.7996780872344971,
+ "learning_rate": 4.707496400310874e-05,
+ "loss": 1.9574,
+ "step": 500
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.8182051181793213,
+ "learning_rate": 4.701736843304025e-05,
+ "loss": 2.0951,
+ "step": 505
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.507320761680603,
+ "learning_rate": 4.695924727871805e-05,
+ "loss": 2.0253,
+ "step": 510
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.759121835231781,
+ "learning_rate": 4.690060192757242e-05,
+ "loss": 2.0602,
+ "step": 515
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.5943195819854736,
+ "learning_rate": 4.684143377954691e-05,
+ "loss": 2.0386,
+ "step": 520
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.8568710088729858,
+ "learning_rate": 4.6781744247064955e-05,
+ "loss": 2.073,
+ "step": 525
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 1.3352620601654053,
+ "learning_rate": 4.6721534754996125e-05,
+ "loss": 2.1443,
+ "step": 530
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.3417474031448364,
+ "learning_rate": 4.666080674062213e-05,
+ "loss": 2.0288,
+ "step": 535
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.5334464311599731,
+ "learning_rate": 4.659956165360251e-05,
+ "loss": 2.0609,
+ "step": 540
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.9658721089363098,
+ "learning_rate": 4.6537800955940005e-05,
+ "loss": 1.9539,
+ "step": 545
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.9197947978973389,
+ "learning_rate": 4.647552612194572e-05,
+ "loss": 2.149,
+ "step": 550
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 0.8512137532234192,
+ "learning_rate": 4.641273863820383e-05,
+ "loss": 1.9722,
+ "step": 555
+ },
+ {
+ "epoch": 0.17,
+ "grad_norm": 1.827289342880249,
+ "learning_rate": 4.634944000353622e-05,
+ "loss": 2.0729,
+ "step": 560
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.088416337966919,
+ "learning_rate": 4.628563172896655e-05,
+ "loss": 1.9507,
+ "step": 565
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3566908836364746,
+ "learning_rate": 4.6221315337684353e-05,
+ "loss": 2.1643,
+ "step": 570
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.3541293144226074,
+ "learning_rate": 4.615649236500854e-05,
+ "loss": 2.1839,
+ "step": 575
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 0.991269588470459,
+ "learning_rate": 4.609116435835083e-05,
+ "loss": 2.0976,
+ "step": 580
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.0280535221099854,
+ "learning_rate": 4.602533287717877e-05,
+ "loss": 2.1474,
+ "step": 585
+ },
+ {
+ "epoch": 0.18,
+ "grad_norm": 1.013123631477356,
+ "learning_rate": 4.5958999492978524e-05,
+ "loss": 2.1873,
+ "step": 590
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1753040552139282,
+ "learning_rate": 4.589216578921737e-05,
+ "loss": 2.1744,
+ "step": 595
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.1839090585708618,
+ "learning_rate": 4.582483336130586e-05,
+ "loss": 1.9982,
+ "step": 600
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.0724798440933228,
+ "learning_rate": 4.575700381655979e-05,
+ "loss": 2.1234,
+ "step": 605
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 2.009913682937622,
+ "learning_rate": 4.5688678774161796e-05,
+ "loss": 1.9478,
+ "step": 610
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.9897060394287109,
+ "learning_rate": 4.561985986512271e-05,
+ "loss": 1.8268,
+ "step": 615
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.8881808519363403,
+ "learning_rate": 4.555054873224263e-05,
+ "loss": 1.9887,
+ "step": 620
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 1.155900001525879,
+ "learning_rate": 4.54807470300717e-05,
+ "loss": 2.0777,
+ "step": 625
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.8782421350479126,
+ "learning_rate": 4.5410456424870596e-05,
+ "loss": 2.0566,
+ "step": 630
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.3324674367904663,
+ "learning_rate": 4.5339678594570795e-05,
+ "loss": 2.047,
+ "step": 635
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.9805939197540283,
+ "learning_rate": 4.526841522873449e-05,
+ "loss": 1.962,
+ "step": 640
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4999943971633911,
+ "learning_rate": 4.519666802851422e-05,
+ "loss": 2.0972,
+ "step": 645
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 1.4504961967468262,
+ "learning_rate": 4.5124438706612376e-05,
+ "loss": 2.0041,
+ "step": 650
+ },
+ {
+ "epoch": 0.2,
+ "grad_norm": 0.9078169465065002,
+ "learning_rate": 4.505172898724018e-05,
+ "loss": 2.1229,
+ "step": 655
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.1635804176330566,
+ "learning_rate": 4.497854060607662e-05,
+ "loss": 2.0195,
+ "step": 660
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.46576726436615,
+ "learning_rate": 4.490487531022699e-05,
+ "loss": 2.0745,
+ "step": 665
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.2094652652740479,
+ "learning_rate": 4.4830734858181145e-05,
+ "loss": 2.1068,
+ "step": 670
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.4738895893096924,
+ "learning_rate": 4.47561210197716e-05,
+ "loss": 1.8088,
+ "step": 675
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 1.23384690284729,
+ "learning_rate": 4.4681035576131215e-05,
+ "loss": 2.0995,
+ "step": 680
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.8332946300506592,
+ "learning_rate": 4.46054803196507e-05,
+ "loss": 2.0541,
+ "step": 685
+ },
+ {
+ "epoch": 0.21,
+ "grad_norm": 0.9207485318183899,
+ "learning_rate": 4.452945705393586e-05,
+ "loss": 2.166,
+ "step": 690
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.292945146560669,
+ "learning_rate": 4.445296759376449e-05,
+ "loss": 2.0784,
+ "step": 695
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9874763488769531,
+ "learning_rate": 4.437601376504307e-05,
+ "loss": 2.2087,
+ "step": 700
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.9427415132522583,
+ "learning_rate": 4.4298597404763186e-05,
+ "loss": 2.1199,
+ "step": 705
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.7369529008865356,
+ "learning_rate": 4.422072036095768e-05,
+ "loss": 2.0355,
+ "step": 710
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2423696517944336,
+ "learning_rate": 4.414238449265654e-05,
+ "loss": 2.0011,
+ "step": 715
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 1.2304831743240356,
+ "learning_rate": 4.406359166984249e-05,
+ "loss": 2.0368,
+ "step": 720
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 0.9090413451194763,
+ "learning_rate": 4.39843437734064e-05,
+ "loss": 1.9983,
+ "step": 725
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.2729507684707642,
+ "learning_rate": 4.390464269510233e-05,
+ "loss": 2.021,
+ "step": 730
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3009227514266968,
+ "learning_rate": 4.382449033750244e-05,
+ "loss": 1.9743,
+ "step": 735
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.5456056594848633,
+ "learning_rate": 4.37438886139515e-05,
+ "loss": 2.0689,
+ "step": 740
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.3235007524490356,
+ "learning_rate": 4.3662839448521264e-05,
+ "loss": 2.0838,
+ "step": 745
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 2.2074007987976074,
+ "learning_rate": 4.358134477596454e-05,
+ "loss": 2.0835,
+ "step": 750
+ },
+ {
+ "epoch": 0.23,
+ "grad_norm": 1.403738021850586,
+ "learning_rate": 4.3499406541668966e-05,
+ "loss": 2.0916,
+ "step": 755
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0940325260162354,
+ "learning_rate": 4.3417026701610616e-05,
+ "loss": 1.972,
+ "step": 760
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.666353702545166,
+ "learning_rate": 4.3334207222307275e-05,
+ "loss": 1.927,
+ "step": 765
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.0777515172958374,
+ "learning_rate": 4.325095008077154e-05,
+ "loss": 2.1192,
+ "step": 770
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.7218186855316162,
+ "learning_rate": 4.316725726446353e-05,
+ "loss": 2.0774,
+ "step": 775
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 1.356753945350647,
+ "learning_rate": 4.3083130771243586e-05,
+ "loss": 2.0847,
+ "step": 780
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.9967429637908936,
+ "learning_rate": 4.299857260932445e-05,
+ "loss": 2.0485,
+ "step": 785
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.6216442584991455,
+ "learning_rate": 4.2913584797223397e-05,
+ "loss": 2.1008,
+ "step": 790
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.2556742429733276,
+ "learning_rate": 4.2828169363714016e-05,
+ "loss": 1.9209,
+ "step": 795
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1800439357757568,
+ "learning_rate": 4.274232834777782e-05,
+ "loss": 1.9722,
+ "step": 800
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.1313499212265015,
+ "learning_rate": 4.2656063798555515e-05,
+ "loss": 1.9176,
+ "step": 805
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.137534737586975,
+ "learning_rate": 4.256937777529815e-05,
+ "loss": 1.9929,
+ "step": 810
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.0575093030929565,
+ "learning_rate": 4.2482272347317906e-05,
+ "loss": 2.166,
+ "step": 815
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 1.5939594507217407,
+ "learning_rate": 4.2394749593938733e-05,
+ "loss": 2.1334,
+ "step": 820
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1045507192611694,
+ "learning_rate": 4.230681160444669e-05,
+ "loss": 2.0853,
+ "step": 825
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.3480136394500732,
+ "learning_rate": 4.221846047804009e-05,
+ "loss": 2.1802,
+ "step": 830
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1822657585144043,
+ "learning_rate": 4.2129698323779366e-05,
+ "loss": 2.0739,
+ "step": 835
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.1771117448806763,
+ "learning_rate": 4.204052726053676e-05,
+ "loss": 2.0238,
+ "step": 840
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 1.4757814407348633,
+ "learning_rate": 4.195094941694571e-05,
+ "loss": 2.1557,
+ "step": 845
+ },
+ {
+ "epoch": 0.26,
+ "grad_norm": 0.9095075726509094,
+ "learning_rate": 4.1860966931350054e-05,
+ "loss": 2.1666,
+ "step": 850
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1039543151855469,
+ "learning_rate": 4.1770581951752976e-05,
+ "loss": 2.105,
+ "step": 855
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 0.8517205119132996,
+ "learning_rate": 4.1679796635765735e-05,
+ "loss": 1.9656,
+ "step": 860
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.239492654800415,
+ "learning_rate": 4.158861315055617e-05,
+ "loss": 2.0166,
+ "step": 865
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.1358321905136108,
+ "learning_rate": 4.1497033672796924e-05,
+ "loss": 2.0076,
+ "step": 870
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.6215249300003052,
+ "learning_rate": 4.140506038861356e-05,
+ "loss": 2.1594,
+ "step": 875
+ },
+ {
+ "epoch": 0.27,
+ "grad_norm": 1.0528080463409424,
+ "learning_rate": 4.131269549353229e-05,
+ "loss": 2.1416,
+ "step": 880
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 0.8976901769638062,
+ "learning_rate": 4.1219941192427644e-05,
+ "loss": 2.1242,
+ "step": 885
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.263594388961792,
+ "learning_rate": 4.112679969946977e-05,
+ "loss": 2.02,
+ "step": 890
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4173017740249634,
+ "learning_rate": 4.103327323807162e-05,
+ "loss": 2.0438,
+ "step": 895
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.876170039176941,
+ "learning_rate": 4.093936404083585e-05,
+ "loss": 1.9806,
+ "step": 900
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.4649231433868408,
+ "learning_rate": 4.0845074349501544e-05,
+ "loss": 2.1476,
+ "step": 905
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0446043014526367,
+ "learning_rate": 4.0750406414890695e-05,
+ "loss": 1.9672,
+ "step": 910
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 1.0225305557250977,
+ "learning_rate": 4.065536249685448e-05,
+ "loss": 1.9984,
+ "step": 915
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0120617151260376,
+ "learning_rate": 4.055994486421929e-05,
+ "loss": 2.1162,
+ "step": 920
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0469881296157837,
+ "learning_rate": 4.04641557947326e-05,
+ "loss": 2.0435,
+ "step": 925
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.2435941696166992,
+ "learning_rate": 4.036799757500856e-05,
+ "loss": 2.0431,
+ "step": 930
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.0055103302001953,
+ "learning_rate": 4.027147250047348e-05,
+ "loss": 2.2021,
+ "step": 935
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1212949752807617,
+ "learning_rate": 4.017458287531094e-05,
+ "loss": 1.997,
+ "step": 940
+ },
+ {
+ "epoch": 0.29,
+ "grad_norm": 1.1048357486724854,
+ "learning_rate": 4.007733101240685e-05,
+ "loss": 1.946,
+ "step": 945
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4721689224243164,
+ "learning_rate": 3.997971923329426e-05,
+ "loss": 2.0723,
+ "step": 950
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.3793156147003174,
+ "learning_rate": 3.988174986809783e-05,
+ "loss": 2.034,
+ "step": 955
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9013482928276062,
+ "learning_rate": 3.9783425255478355e-05,
+ "loss": 1.9736,
+ "step": 960
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 0.9192422032356262,
+ "learning_rate": 3.968474774257682e-05,
+ "loss": 1.9878,
+ "step": 965
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.9304206371307373,
+ "learning_rate": 3.9585719684958446e-05,
+ "loss": 2.117,
+ "step": 970
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.0435137748718262,
+ "learning_rate": 3.948634344655639e-05,
+ "loss": 2.0585,
+ "step": 975
+ },
+ {
+ "epoch": 0.3,
+ "grad_norm": 1.4636590480804443,
+ "learning_rate": 3.938662139961538e-05,
+ "loss": 2.0409,
+ "step": 980
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.8014529943466187,
+ "learning_rate": 3.928655592463508e-05,
+ "loss": 2.0369,
+ "step": 985
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2412620782852173,
+ "learning_rate": 3.918614941031319e-05,
+ "loss": 1.967,
+ "step": 990
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.3581103086471558,
+ "learning_rate": 3.908540425348852e-05,
+ "loss": 2.0037,
+ "step": 995
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.2377780675888062,
+ "learning_rate": 3.8984322859083725e-05,
+ "loss": 1.9991,
+ "step": 1000
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 0.9209259748458862,
+ "learning_rate": 3.8882907640047896e-05,
+ "loss": 2.0448,
+ "step": 1005
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 1.0150959491729736,
+ "learning_rate": 3.878116101729897e-05,
+ "loss": 2.0791,
+ "step": 1010
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.5959141254425049,
+ "learning_rate": 3.867908541966594e-05,
+ "loss": 1.9997,
+ "step": 1015
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.3945012092590332,
+ "learning_rate": 3.857668328383088e-05,
+ "loss": 2.0481,
+ "step": 1020
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.2361671924591064,
+ "learning_rate": 3.847395705427075e-05,
+ "loss": 2.2664,
+ "step": 1025
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.9661719799041748,
+ "learning_rate": 3.837090918319909e-05,
+ "loss": 1.9752,
+ "step": 1030
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.6995949745178223,
+ "learning_rate": 3.8267542130507436e-05,
+ "loss": 2.1332,
+ "step": 1035
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 1.1248412132263184,
+ "learning_rate": 3.816385836370663e-05,
+ "loss": 2.0432,
+ "step": 1040
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.8734235763549805,
+ "learning_rate": 3.805986035786789e-05,
+ "loss": 1.9618,
+ "step": 1045
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.322766661643982,
+ "learning_rate": 3.795555059556378e-05,
+ "loss": 2.0267,
+ "step": 1050
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.0396028757095337,
+ "learning_rate": 3.7850931566808866e-05,
+ "loss": 2.1075,
+ "step": 1055
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 0.9574625492095947,
+ "learning_rate": 3.7746005769000363e-05,
+ "loss": 2.156,
+ "step": 1060
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.4480133056640625,
+ "learning_rate": 3.764077570685844e-05,
+ "loss": 1.9615,
+ "step": 1065
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.5908560752868652,
+ "learning_rate": 3.753524389236648e-05,
+ "loss": 2.0928,
+ "step": 1070
+ },
+ {
+ "epoch": 0.33,
+ "grad_norm": 1.2628813982009888,
+ "learning_rate": 3.742941284471111e-05,
+ "loss": 2.1074,
+ "step": 1075
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2687503099441528,
+ "learning_rate": 3.7323285090222054e-05,
+ "loss": 1.9666,
+ "step": 1080
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2571731805801392,
+ "learning_rate": 3.721686316231181e-05,
+ "loss": 2.0468,
+ "step": 1085
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.007453441619873,
+ "learning_rate": 3.7110149601415215e-05,
+ "loss": 2.0624,
+ "step": 1090
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.2390377521514893,
+ "learning_rate": 3.700314695492876e-05,
+ "loss": 1.9888,
+ "step": 1095
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 1.0878371000289917,
+ "learning_rate": 3.6895857777149825e-05,
+ "loss": 2.1013,
+ "step": 1100
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 0.8759217262268066,
+ "learning_rate": 3.6788284629215624e-05,
+ "loss": 1.875,
+ "step": 1105
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.1345970630645752,
+ "learning_rate": 3.668043007904219e-05,
+ "loss": 1.9096,
+ "step": 1110
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.253629446029663,
+ "learning_rate": 3.6572296701262966e-05,
+ "loss": 2.1859,
+ "step": 1115
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 0.9796190857887268,
+ "learning_rate": 3.646388707716738e-05,
+ "loss": 2.2092,
+ "step": 1120
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.3893767595291138,
+ "learning_rate": 3.635520379463926e-05,
+ "loss": 2.0026,
+ "step": 1125
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 0.8778309226036072,
+ "learning_rate": 3.6246249448095004e-05,
+ "loss": 2.2112,
+ "step": 1130
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.2479698657989502,
+ "learning_rate": 3.6137026638421696e-05,
+ "loss": 2.0221,
+ "step": 1135
+ },
+ {
+ "epoch": 0.35,
+ "grad_norm": 1.3813824653625488,
+ "learning_rate": 3.6027537972914974e-05,
+ "loss": 1.9106,
+ "step": 1140
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.2043218612670898,
+ "learning_rate": 3.5917786065216826e-05,
+ "loss": 2.0673,
+ "step": 1145
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.5337340831756592,
+ "learning_rate": 3.580777353525318e-05,
+ "loss": 2.1463,
+ "step": 1150
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.155813455581665,
+ "learning_rate": 3.5697503009171385e-05,
+ "loss": 2.0255,
+ "step": 1155
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.034644365310669,
+ "learning_rate": 3.558697711927748e-05,
+ "loss": 2.1348,
+ "step": 1160
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.0959795713424683,
+ "learning_rate": 3.54761985039734e-05,
+ "loss": 2.1457,
+ "step": 1165
+ },
+ {
+ "epoch": 0.36,
+ "grad_norm": 1.1938838958740234,
+ "learning_rate": 3.5365169807693966e-05,
+ "loss": 2.1256,
+ "step": 1170
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.8162047863006592,
+ "learning_rate": 3.525389368084379e-05,
+ "loss": 1.9587,
+ "step": 1175
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9358930587768555,
+ "learning_rate": 3.514237277973393e-05,
+ "loss": 1.8965,
+ "step": 1180
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9210988879203796,
+ "learning_rate": 3.503060976651862e-05,
+ "loss": 1.9669,
+ "step": 1185
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.4641343355178833,
+ "learning_rate": 3.491860730913156e-05,
+ "loss": 2.003,
+ "step": 1190
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.2458257675170898,
+ "learning_rate": 3.480636808122235e-05,
+ "loss": 2.1487,
+ "step": 1195
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 1.6770122051239014,
+ "learning_rate": 3.469389476209259e-05,
+ "loss": 2.0686,
+ "step": 1200
+ },
+ {
+ "epoch": 0.37,
+ "grad_norm": 0.9083845019340515,
+ "learning_rate": 3.458119003663199e-05,
+ "loss": 2.0284,
+ "step": 1205
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.2679696083068848,
+ "learning_rate": 3.446825659525421e-05,
+ "loss": 2.0555,
+ "step": 1210
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.3823720216751099,
+ "learning_rate": 3.435509713383268e-05,
+ "loss": 1.9375,
+ "step": 1215
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.5862077474594116,
+ "learning_rate": 3.424171435363623e-05,
+ "loss": 2.0271,
+ "step": 1220
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 2.0107533931732178,
+ "learning_rate": 3.412811096126461e-05,
+ "loss": 2.1897,
+ "step": 1225
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.4544458389282227,
+ "learning_rate": 3.401428966858387e-05,
+ "loss": 1.9978,
+ "step": 1230
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 1.188170075416565,
+ "learning_rate": 3.390025319266167e-05,
+ "loss": 2.0688,
+ "step": 1235
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.1016322374343872,
+ "learning_rate": 3.3786004255702336e-05,
+ "loss": 2.0396,
+ "step": 1240
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.6623334884643555,
+ "learning_rate": 3.3671545584981954e-05,
+ "loss": 1.9566,
+ "step": 1245
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9161584377288818,
+ "learning_rate": 3.355687991278324e-05,
+ "loss": 2.0474,
+ "step": 1250
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9911025166511536,
+ "learning_rate": 3.3442009976330305e-05,
+ "loss": 2.2163,
+ "step": 1255
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.1504255533218384,
+ "learning_rate": 3.332693851772331e-05,
+ "loss": 2.1088,
+ "step": 1260
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 0.9544184803962708,
+ "learning_rate": 3.3211668283873035e-05,
+ "loss": 1.8947,
+ "step": 1265
+ },
+ {
+ "epoch": 0.39,
+ "grad_norm": 1.4625756740570068,
+ "learning_rate": 3.3096202026435304e-05,
+ "loss": 2.1748,
+ "step": 1270
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.3267475366592407,
+ "learning_rate": 3.298054250174527e-05,
+ "loss": 1.9218,
+ "step": 1275
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.9869363903999329,
+ "learning_rate": 3.2864692470751654e-05,
+ "loss": 2.2723,
+ "step": 1280
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.5177838802337646,
+ "learning_rate": 3.27486546989508e-05,
+ "loss": 2.1456,
+ "step": 1285
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.1998714208602905,
+ "learning_rate": 3.263243195632068e-05,
+ "loss": 1.8877,
+ "step": 1290
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.2112164497375488,
+ "learning_rate": 3.2516027017254785e-05,
+ "loss": 2.0615,
+ "step": 1295
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 1.0616129636764526,
+ "learning_rate": 3.239944266049587e-05,
+ "loss": 2.0402,
+ "step": 1300
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.4537287950515747,
+ "learning_rate": 3.228268166906962e-05,
+ "loss": 2.0728,
+ "step": 1305
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.3899391889572144,
+ "learning_rate": 3.2165746830218254e-05,
+ "loss": 2.1815,
+ "step": 1310
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.332529067993164,
+ "learning_rate": 3.204864093533394e-05,
+ "loss": 1.8935,
+ "step": 1315
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.4466496706008911,
+ "learning_rate": 3.193136677989221e-05,
+ "loss": 1.9567,
+ "step": 1320
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 1.1781721115112305,
+ "learning_rate": 3.181392716338516e-05,
+ "loss": 2.055,
+ "step": 1325
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 0.9411901831626892,
+ "learning_rate": 3.1696324889254716e-05,
+ "loss": 1.8794,
+ "step": 1330
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.2628341913223267,
+ "learning_rate": 3.15785627648256e-05,
+ "loss": 2.0299,
+ "step": 1335
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.4857370853424072,
+ "learning_rate": 3.146064360123846e-05,
+ "loss": 1.9342,
+ "step": 1340
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.661470651626587,
+ "learning_rate": 3.1342570213382594e-05,
+ "loss": 2.0399,
+ "step": 1345
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.522845983505249,
+ "learning_rate": 3.122434541982888e-05,
+ "loss": 2.1419,
+ "step": 1350
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.5679118633270264,
+ "learning_rate": 3.110597204276247e-05,
+ "loss": 2.2932,
+ "step": 1355
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.3367788791656494,
+ "learning_rate": 3.098745290791539e-05,
+ "loss": 1.8989,
+ "step": 1360
+ },
+ {
+ "epoch": 0.42,
+ "grad_norm": 1.3873472213745117,
+ "learning_rate": 3.086879084449907e-05,
+ "loss": 2.1214,
+ "step": 1365
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.2957035303115845,
+ "learning_rate": 3.074998868513688e-05,
+ "loss": 2.2538,
+ "step": 1370
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.122176170349121,
+ "learning_rate": 3.0631049265796465e-05,
+ "loss": 2.0974,
+ "step": 1375
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.0422618389129639,
+ "learning_rate": 3.051197542572203e-05,
+ "loss": 2.054,
+ "step": 1380
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 1.1926140785217285,
+ "learning_rate": 3.0392770007366584e-05,
+ "loss": 1.9798,
+ "step": 1385
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 0.8764025568962097,
+ "learning_rate": 3.0273435856324112e-05,
+ "loss": 2.0796,
+ "step": 1390
+ },
+ {
+ "epoch": 0.43,
+ "grad_norm": 0.8200764656066895,
+ "learning_rate": 3.0153975821261605e-05,
+ "loss": 1.9116,
+ "step": 1395
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.0340498685836792,
+ "learning_rate": 3.0034392753851066e-05,
+ "loss": 2.0235,
+ "step": 1400
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.0799012184143066,
+ "learning_rate": 2.9914689508701476e-05,
+ "loss": 2.1455,
+ "step": 1405
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.301015853881836,
+ "learning_rate": 2.979486894329058e-05,
+ "loss": 2.0355,
+ "step": 1410
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.2926914691925049,
+ "learning_rate": 2.9674933917896747e-05,
+ "loss": 2.0379,
+ "step": 1415
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.4712942838668823,
+ "learning_rate": 2.9554887295530647e-05,
+ "loss": 2.0802,
+ "step": 1420
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.1957335472106934,
+ "learning_rate": 2.943473194186693e-05,
+ "loss": 2.1044,
+ "step": 1425
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 1.568293571472168,
+ "learning_rate": 2.9314470725175792e-05,
+ "loss": 2.0121,
+ "step": 1430
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 1.4844893217086792,
+ "learning_rate": 2.919410651625455e-05,
+ "loss": 2.0717,
+ "step": 1435
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 1.3942641019821167,
+ "learning_rate": 2.907364218835904e-05,
+ "loss": 1.9522,
+ "step": 1440
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 0.7795314788818359,
+ "learning_rate": 2.8953080617135115e-05,
+ "loss": 1.9593,
+ "step": 1445
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 1.751107931137085,
+ "learning_rate": 2.8832424680549937e-05,
+ "loss": 1.8073,
+ "step": 1450
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 1.2202279567718506,
+ "learning_rate": 2.8711677258823306e-05,
+ "loss": 2.0042,
+ "step": 1455
+ },
+ {
+ "epoch": 0.45,
+ "grad_norm": 1.5163853168487549,
+ "learning_rate": 2.859084123435887e-05,
+ "loss": 1.9931,
+ "step": 1460
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 0.94038987159729,
+ "learning_rate": 2.84699194916754e-05,
+ "loss": 2.1533,
+ "step": 1465
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 1.4618102312088013,
+ "learning_rate": 2.834891491733781e-05,
+ "loss": 2.029,
+ "step": 1470
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 0.9747155904769897,
+ "learning_rate": 2.822783039988836e-05,
+ "loss": 2.0241,
+ "step": 1475
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 1.0887038707733154,
+ "learning_rate": 2.8106668829777645e-05,
+ "loss": 2.0959,
+ "step": 1480
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 1.2170171737670898,
+ "learning_rate": 2.7985433099295618e-05,
+ "loss": 1.8718,
+ "step": 1485
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 1.1366883516311646,
+ "learning_rate": 2.7864126102502524e-05,
+ "loss": 2.2397,
+ "step": 1490
+ },
+ {
+ "epoch": 0.46,
+ "grad_norm": 1.1206785440444946,
+ "learning_rate": 2.774275073515985e-05,
+ "loss": 2.1083,
+ "step": 1495
+ },
+ {
+ "epoch": 0.47,
+ "grad_norm": 1.126807451248169,
+ "learning_rate": 2.7621309894661167e-05,
+ "loss": 2.0764,
+ "step": 1500
+ },
+ {
+ "epoch": 0.47,
+ "grad_norm": 1.0077627897262573,
+ "learning_rate": 2.7499806479962997e-05,
+ "loss": 2.0955,
+ "step": 1505
+ },
+ {
+ "epoch": 0.47,
+ "grad_norm": 0.9740080833435059,
+ "learning_rate": 2.7378243391515558e-05,
+ "loss": 2.0449,
+ "step": 1510
+ },
+ {
+ "epoch": 0.47,
+ "grad_norm": 1.100853681564331,
+ "learning_rate": 2.7256623531193605e-05,
+ "loss": 1.8368,
+ "step": 1515
+ },
+ {
+ "epoch": 0.47,
+ "grad_norm": 1.147560954093933,
+ "learning_rate": 2.7134949802227073e-05,
+ "loss": 2.024,
+ "step": 1520
+ },
+ {
+ "epoch": 0.47,
+ "grad_norm": 0.8977387547492981,
+ "learning_rate": 2.7013225109131836e-05,
+ "loss": 2.0699,
+ "step": 1525
+ },
+ {
+ "epoch": 0.48,
+ "grad_norm": 1.5398712158203125,
+ "learning_rate": 2.689145235764035e-05,
+ "loss": 1.953,
+ "step": 1530
+ },
+ {
+ "epoch": 0.48,
+ "step": 1534,
+ "total_flos": 2.060795874115584e+16,
+ "train_loss": 2.0602192145127516,
+ "train_runtime": 801.4891,
+ "train_samples_per_second": 64.198,
+ "train_steps_per_second": 4.011
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 3215,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 100,
+ "total_flos": 2.060795874115584e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/training_args.bin b/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0
--- /dev/null
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9
+size 5112