diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-100/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-100/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc03c082514a5c09cbe5b0e5ca3b3c4f2ea4fa90 --- /dev/null +++ b/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1feaae461eeadb5beb9c4062de583386b3387578fda1f3fd0bf0a01207bb3a6d +size 284628602 diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d602a0b78d132dcd53f81a72e3899303ef72317c --- /dev/null +++ b/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d90fab8063bac27bbc03ad7b8e96092834ee7799bc4eaae9ec98aa646a6358f6 +size 14244 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..318a48d667b3ecd4666dc692587f103642e3669d --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a1553fa3c9fa4158c1d95c29f303ea1c82da9959617b77d6296624e38c6ca27 +size 1064 diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1c0b76eb461f33b7e474bb6aedee5916110a95b0 --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,131 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.16326530612245, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 5.738534596761354e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-120/README.md b/checkpoint-120/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-120/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-120/adapter_config.json b/checkpoint-120/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-120/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-120/adapter_model.safetensors b/checkpoint-120/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-120/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-120/optimizer.pt b/checkpoint-120/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..58012379df7fb07e00e6800310b7a0136aab36e8 --- /dev/null +++ b/checkpoint-120/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:114dfd322ab013f62616a9a1cfb3c4b02ccf97f698e28bfa2a74a9f40856e99a +size 284628602 diff --git a/checkpoint-120/rng_state.pth b/checkpoint-120/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..332d878bb40dd469698f0ce249139b035b5a04d9 --- /dev/null +++ b/checkpoint-120/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5618ca5f563e0e88f407ba712395955e41529e4d54f3b9fc10e2af22327ec8cc +size 14244 diff --git a/checkpoint-120/scheduler.pt b/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fa93f4b997f90de52bbb57f00f5b82cee75381a --- /dev/null +++ b/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:915b81b2641ff3c1c29a91260eaebd9d13138742f1687cfcdc8c68bcb9ea698e +size 1064 diff --git a/checkpoint-120/trainer_state.json b/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..164381e495b8d38526141b93902ad6c7fa1962d8 --- /dev/null +++ b/checkpoint-120/trainer_state.json @@ -0,0 +1,153 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.795918367346939, + "eval_steps": 20, + "global_step": 120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 6.887969287129006e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-120/training_args.bin b/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-140/README.md b/checkpoint-140/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-140/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-140/adapter_config.json b/checkpoint-140/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-140/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-140/adapter_model.safetensors b/checkpoint-140/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-140/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-140/optimizer.pt b/checkpoint-140/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c4ed057d308c0ef2c245ae7b383aa03b39eacd5 --- /dev/null +++ b/checkpoint-140/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:222684730d062ab6a2f08b62218a814e0c9939380313556b54cf827c12929f8b +size 284628602 diff --git a/checkpoint-140/rng_state.pth b/checkpoint-140/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a866188c4475412d39e020396dba53e10e910537 --- /dev/null +++ b/checkpoint-140/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2ac710919be23109fc86c5fff99822b2cb7d7cfb09bc3a5a36f390af49cb96a +size 14244 diff --git a/checkpoint-140/scheduler.pt b/checkpoint-140/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..608ef2d9b88b439e133656d8fcce0b3141115e28 --- /dev/null +++ b/checkpoint-140/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86d5f7e9b17251247c39fdd59964f3693734ddb120bd20dcb453e9efae1445bd +size 1064 diff --git a/checkpoint-140/trainer_state.json b/checkpoint-140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..51749bc1f593c4aa4482eaf2249e70d63b248719 --- /dev/null +++ b/checkpoint-140/trainer_state.json @@ -0,0 +1,175 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.428571428571429, + "eval_steps": 20, + "global_step": 140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 8.03366825638232e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-140/training_args.bin b/checkpoint-140/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-160/README.md b/checkpoint-160/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-160/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-160/adapter_config.json b/checkpoint-160/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-160/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-160/adapter_model.safetensors b/checkpoint-160/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-160/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-160/optimizer.pt b/checkpoint-160/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f993ea49ff0af264820baba834c98ac39b08e71 --- /dev/null +++ b/checkpoint-160/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2747c6579221cdf5bb28a934c1eb1fb4aff8ef6d09a8b0bd96f626ed051b467c +size 284628602 diff --git a/checkpoint-160/rng_state.pth b/checkpoint-160/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..891e7f7ec2620dd626a23b78a6aeea99a5fd180e --- /dev/null +++ b/checkpoint-160/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2db044807e15fad135cbedea4b155f149313f24f1ea48d56839a21289a56f10 +size 14244 diff --git a/checkpoint-160/scheduler.pt b/checkpoint-160/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..384d44e5e6558d886996d98c0679b2de9b934a4c --- /dev/null +++ b/checkpoint-160/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6bc91ca4e04e59b6aa0def614e5d23daa02dfd514357b40e48edfe0bf128e03 +size 1064 diff --git a/checkpoint-160/trainer_state.json b/checkpoint-160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..16357ab526feb5f2e1c6f9f7e20c8afbfb20c1d2 --- /dev/null +++ b/checkpoint-160/trainer_state.json @@ -0,0 +1,197 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 13.061224489795919, + "eval_steps": 20, + "global_step": 160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 9.17609846966059e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-160/training_args.bin b/checkpoint-160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-180/README.md b/checkpoint-180/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-180/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-180/adapter_config.json b/checkpoint-180/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-180/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-180/adapter_model.safetensors b/checkpoint-180/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-180/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-180/optimizer.pt b/checkpoint-180/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0025ee9d809320e2c3a42ac3664fb0f4392b2c8 --- /dev/null +++ b/checkpoint-180/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac59b4a8215def070849b25780be46446aedd048d71d1cd8965d8b3e3c958f0a +size 284628602 diff --git a/checkpoint-180/rng_state.pth b/checkpoint-180/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6c90592f6140364b1eaa2a325221d598d7d9f7ed --- /dev/null +++ b/checkpoint-180/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c643cd616fa0e3b8fc7b1ca162a78b68d156fe98612a09f80241bf8fd9147e5d +size 14244 diff --git a/checkpoint-180/scheduler.pt b/checkpoint-180/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..900ec9e5bde9a5e4f8680c6956fe26c89923d01b --- /dev/null +++ b/checkpoint-180/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5dfc3bc75009efe48747cf9948a244ecefe4e6b758242f83bed075794e9a377 +size 1064 diff --git a/checkpoint-180/trainer_state.json b/checkpoint-180/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1f6eefe61a895b97393132fa7d0ef16a8fb5346d --- /dev/null +++ b/checkpoint-180/trainer_state.json @@ -0,0 +1,219 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 14.693877551020408, + "eval_steps": 20, + "global_step": 180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 1.0330202811421164e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-180/training_args.bin b/checkpoint-180/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-180/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-20/README.md b/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-20/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-20/adapter_config.json b/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-20/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-20/adapter_model.safetensors b/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-20/optimizer.pt b/checkpoint-20/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8dd68ca6f752d2a16b4ec8e9d1cff56f20d1159 --- /dev/null +++ b/checkpoint-20/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df048142c25b752046df970d6dcc0693d5d7f4c4193730d6dbdeabe3af9861ad +size 284628602 diff --git a/checkpoint-20/rng_state.pth b/checkpoint-20/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..dae5ec1cccd3f30a9db135f547386ac77e0daf33 --- /dev/null +++ b/checkpoint-20/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dbe6d770d526425c4ebf1c166cc93b0c0ddb2b941fd4176071da7fada6d3fe5 +size 14244 diff --git a/checkpoint-20/scheduler.pt b/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0b30d480a76e7df4b910aa4733f5d1c8ecf4338 --- /dev/null +++ b/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bc30782c80f39b95ece53e16ed533d0eb1d775796dc4d2c39691e355b142ab4 +size 1064 diff --git a/checkpoint-20/trainer_state.json b/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..26f8e0734d25a2efd55dd2718e7a719099e4ef2e --- /dev/null +++ b/checkpoint-20/trainer_state.json @@ -0,0 +1,43 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6326530612244898, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 1.1541043417605734e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-20/training_args.bin b/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-200/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-200/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-200/adapter_model.safetensors b/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c02a283116b03d2b2404681ace2c59935d5018b --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26cb22ac8210f02dc52c28309a4874485ef4e82e679847351f7bd4ca454bbe32 +size 284628602 diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b7b964654466296f8f2f080bc3f86ca422e0afd5 --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:637237de1d6cfcc0cc43e19a5f2a0f933019b992a579e0440bc25f8954866eab +size 14244 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f613c3ff4de97ff4d7e9aa2a9f8ee26805165829 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6795bda54303bd8bc8385fc9d0f4a2d7f6d29bd4f0ce908d2fb2991845ecc313 +size 1064 diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1ba3af8fb6cd19e743e9c5597155c544c347daef --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,241 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 16.3265306122449, + "eval_steps": 20, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + }, + { + "epoch": 15.51, + "grad_norm": 0.1541435867547989, + "learning_rate": 0.00020999999999999998, + "loss": 0.0352, + "step": 190 + }, + { + "epoch": 16.33, + "grad_norm": 0.1553652435541153, + "learning_rate": 0.00019999999999999998, + "loss": 0.0325, + "step": 200 + }, + { + "epoch": 16.33, + "eval_loss": 0.28704825043678284, + "eval_runtime": 89.7951, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 200 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 1.1471699094420849e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-220/README.md b/checkpoint-220/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-220/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-220/adapter_config.json b/checkpoint-220/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-220/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-220/adapter_model.safetensors b/checkpoint-220/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-220/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-220/optimizer.pt b/checkpoint-220/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2384e73e60d611465eb447ba6d53a2ecfad913c3 --- /dev/null +++ b/checkpoint-220/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c47636495e01d276ffae9896bb8c18ce73010627055cf807383088ed2dae88fc +size 284628602 diff --git a/checkpoint-220/rng_state.pth b/checkpoint-220/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e9c8203df2d7833c6ca18a6e7b47cb8d873b8ace --- /dev/null +++ b/checkpoint-220/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3edbaa73ac7f30eefe8c0dd8eb537afd3ac94c6703bee94965a470000ecd437b +size 14244 diff --git a/checkpoint-220/scheduler.pt b/checkpoint-220/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..46f9dfab9e077ea12afa1d94a7c8aff731125da3 --- /dev/null +++ b/checkpoint-220/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd18ed34ef6e0139641e61e552552fe67f5e90236dba94b6c8eca5985486bd06 +size 1064 diff --git a/checkpoint-220/trainer_state.json b/checkpoint-220/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..364481e9b6a9a1570acbb75ba19cacb737e94484 --- /dev/null +++ b/checkpoint-220/trainer_state.json @@ -0,0 +1,263 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 17.959183673469386, + "eval_steps": 20, + "global_step": 220, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + }, + { + "epoch": 15.51, + "grad_norm": 0.1541435867547989, + "learning_rate": 0.00020999999999999998, + "loss": 0.0352, + "step": 190 + }, + { + "epoch": 16.33, + "grad_norm": 0.1553652435541153, + "learning_rate": 0.00019999999999999998, + "loss": 0.0325, + "step": 200 + }, + { + "epoch": 16.33, + "eval_loss": 0.28704825043678284, + "eval_runtime": 89.7951, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 200 + }, + { + "epoch": 17.14, + "grad_norm": 0.13403691351413727, + "learning_rate": 0.00018999999999999998, + "loss": 0.0297, + "step": 210 + }, + { + "epoch": 17.96, + "grad_norm": 0.14512716233730316, + "learning_rate": 0.00017999999999999998, + "loss": 0.0279, + "step": 220 + }, + { + "epoch": 17.96, + "eval_loss": 0.2964874505996704, + "eval_runtime": 89.7009, + "eval_samples_per_second": 4.314, + "eval_steps_per_second": 0.546, + "step": 220 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 1.262300164534567e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-220/training_args.bin b/checkpoint-220/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-220/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-240/README.md b/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-240/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-240/adapter_config.json b/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-240/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-240/adapter_model.safetensors b/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-240/optimizer.pt b/checkpoint-240/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d6334077859fcf334b9215fc684bc57fcfc7ccb --- /dev/null +++ b/checkpoint-240/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:300e34361117aac8ed136e5d7a7d3ae6b8594ef84ea1cb9a4df35b4290be6988 +size 284628602 diff --git a/checkpoint-240/rng_state.pth b/checkpoint-240/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c2527708f69e3183138740d2549c7d1dcfe140f7 --- /dev/null +++ b/checkpoint-240/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d95dcc3c79699f6cd0d3a5165464d687a39bc265545d10ef96194e4b4411d702 +size 14244 diff --git a/checkpoint-240/scheduler.pt b/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd718e258bc8056a6e0a813b8d692817c2f44439 --- /dev/null +++ b/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35029adf67ec7b6288431b13620690917e0d68e95151a8d732f3eb783ef0c816 +size 1064 diff --git a/checkpoint-240/trainer_state.json b/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..08e314dda5bac2a9ecd3144645506cdea42c3f14 --- /dev/null +++ b/checkpoint-240/trainer_state.json @@ -0,0 +1,285 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 19.591836734693878, + "eval_steps": 20, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + }, + { + "epoch": 15.51, + "grad_norm": 0.1541435867547989, + "learning_rate": 0.00020999999999999998, + "loss": 0.0352, + "step": 190 + }, + { + "epoch": 16.33, + "grad_norm": 0.1553652435541153, + "learning_rate": 0.00019999999999999998, + "loss": 0.0325, + "step": 200 + }, + { + "epoch": 16.33, + "eval_loss": 0.28704825043678284, + "eval_runtime": 89.7951, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 200 + }, + { + "epoch": 17.14, + "grad_norm": 0.13403691351413727, + "learning_rate": 0.00018999999999999998, + "loss": 0.0297, + "step": 210 + }, + { + "epoch": 17.96, + "grad_norm": 0.14512716233730316, + "learning_rate": 0.00017999999999999998, + "loss": 0.0279, + "step": 220 + }, + { + "epoch": 17.96, + "eval_loss": 0.2964874505996704, + "eval_runtime": 89.7009, + "eval_samples_per_second": 4.314, + "eval_steps_per_second": 0.546, + "step": 220 + }, + { + "epoch": 18.78, + "grad_norm": 0.12400835007429123, + "learning_rate": 0.00016999999999999999, + "loss": 0.0263, + "step": 230 + }, + { + "epoch": 19.59, + "grad_norm": 0.1139909029006958, + "learning_rate": 0.00015999999999999999, + "loss": 0.0246, + "step": 240 + }, + { + "epoch": 19.59, + "eval_loss": 0.30519917607307434, + "eval_runtime": 89.8387, + "eval_samples_per_second": 4.308, + "eval_steps_per_second": 0.545, + "step": 240 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 1.3766132306332877e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-240/training_args.bin b/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-260/README.md b/checkpoint-260/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-260/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-260/adapter_config.json b/checkpoint-260/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-260/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-260/adapter_model.safetensors b/checkpoint-260/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-260/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-260/optimizer.pt b/checkpoint-260/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ebc0b95b26ccd025f46abeb9451f05222969317 --- /dev/null +++ b/checkpoint-260/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce9fe32629270fa64ee19a35edef93a0e79ff41115edd10fb6e268a6676e09c1 +size 284628602 diff --git a/checkpoint-260/rng_state.pth b/checkpoint-260/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5116ad0b14fcec210ce4114fc4bfdfbc5c4bc4bf --- /dev/null +++ b/checkpoint-260/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e089d826e2d134b035865eb0da40dc9031b0478f13706ba2458c1d7e22fc5747 +size 14244 diff --git a/checkpoint-260/scheduler.pt b/checkpoint-260/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7dc1cbf308c73ec7f2c8ba6761bfa8c3cb511190 --- /dev/null +++ b/checkpoint-260/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:618426a7b0745c9d6476d3c7a67a48976de53907877b0f786d6ae0b3e2e942da +size 1064 diff --git a/checkpoint-260/trainer_state.json b/checkpoint-260/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a933bc70843aa7c572975e3f5129556bf5819742 --- /dev/null +++ b/checkpoint-260/trainer_state.json @@ -0,0 +1,307 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 21.224489795918366, + "eval_steps": 20, + "global_step": 260, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + }, + { + "epoch": 15.51, + "grad_norm": 0.1541435867547989, + "learning_rate": 0.00020999999999999998, + "loss": 0.0352, + "step": 190 + }, + { + "epoch": 16.33, + "grad_norm": 0.1553652435541153, + "learning_rate": 0.00019999999999999998, + "loss": 0.0325, + "step": 200 + }, + { + "epoch": 16.33, + "eval_loss": 0.28704825043678284, + "eval_runtime": 89.7951, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 200 + }, + { + "epoch": 17.14, + "grad_norm": 0.13403691351413727, + "learning_rate": 0.00018999999999999998, + "loss": 0.0297, + "step": 210 + }, + { + "epoch": 17.96, + "grad_norm": 0.14512716233730316, + "learning_rate": 0.00017999999999999998, + "loss": 0.0279, + "step": 220 + }, + { + "epoch": 17.96, + "eval_loss": 0.2964874505996704, + "eval_runtime": 89.7009, + "eval_samples_per_second": 4.314, + "eval_steps_per_second": 0.546, + "step": 220 + }, + { + "epoch": 18.78, + "grad_norm": 0.12400835007429123, + "learning_rate": 0.00016999999999999999, + "loss": 0.0263, + "step": 230 + }, + { + "epoch": 19.59, + "grad_norm": 0.1139909029006958, + "learning_rate": 0.00015999999999999999, + "loss": 0.0246, + "step": 240 + }, + { + "epoch": 19.59, + "eval_loss": 0.30519917607307434, + "eval_runtime": 89.8387, + "eval_samples_per_second": 4.308, + "eval_steps_per_second": 0.545, + "step": 240 + }, + { + "epoch": 20.41, + "grad_norm": 0.12317101657390594, + "learning_rate": 0.00015, + "loss": 0.0235, + "step": 250 + }, + { + "epoch": 21.22, + "grad_norm": 0.12494686245918274, + "learning_rate": 0.00014, + "loss": 0.0224, + "step": 260 + }, + { + "epoch": 21.22, + "eval_loss": 0.314134418964386, + "eval_runtime": 89.7974, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 260 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 1.49113643104469e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-260/training_args.bin b/checkpoint-260/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-260/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-280/README.md b/checkpoint-280/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-280/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-280/adapter_config.json b/checkpoint-280/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-280/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-280/adapter_model.safetensors b/checkpoint-280/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-280/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-280/optimizer.pt b/checkpoint-280/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..68e3782ce4c0252f68f4a8ba061748447b84c9d9 --- /dev/null +++ b/checkpoint-280/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01fcc6ba5ccd409790320e8a4ac2e3007d82fa6d14e8f05f6c3933b60a6aaa95 +size 284628602 diff --git a/checkpoint-280/rng_state.pth b/checkpoint-280/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f6246157c94c5db7a176f0945a202ae2b9934970 --- /dev/null +++ b/checkpoint-280/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2316503a65fe2763f2ee91e2cd06e9da95858ca7b41cee85369a45dab120c2ba +size 14244 diff --git a/checkpoint-280/scheduler.pt b/checkpoint-280/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2eea9aa1d6851803e5b4cf6bad706f10867b92a --- /dev/null +++ b/checkpoint-280/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:326b29a713a65b60d1cdb16e40a0ac0c9e0bc23145aea4ec7d4e2e525de2f511 +size 1064 diff --git a/checkpoint-280/trainer_state.json b/checkpoint-280/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..647f08f7c77603bc0cd1b7738aa733cc5421d91d --- /dev/null +++ b/checkpoint-280/trainer_state.json @@ -0,0 +1,329 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 22.857142857142858, + "eval_steps": 20, + "global_step": 280, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + }, + { + "epoch": 15.51, + "grad_norm": 0.1541435867547989, + "learning_rate": 0.00020999999999999998, + "loss": 0.0352, + "step": 190 + }, + { + "epoch": 16.33, + "grad_norm": 0.1553652435541153, + "learning_rate": 0.00019999999999999998, + "loss": 0.0325, + "step": 200 + }, + { + "epoch": 16.33, + "eval_loss": 0.28704825043678284, + "eval_runtime": 89.7951, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 200 + }, + { + "epoch": 17.14, + "grad_norm": 0.13403691351413727, + "learning_rate": 0.00018999999999999998, + "loss": 0.0297, + "step": 210 + }, + { + "epoch": 17.96, + "grad_norm": 0.14512716233730316, + "learning_rate": 0.00017999999999999998, + "loss": 0.0279, + "step": 220 + }, + { + "epoch": 17.96, + "eval_loss": 0.2964874505996704, + "eval_runtime": 89.7009, + "eval_samples_per_second": 4.314, + "eval_steps_per_second": 0.546, + "step": 220 + }, + { + "epoch": 18.78, + "grad_norm": 0.12400835007429123, + "learning_rate": 0.00016999999999999999, + "loss": 0.0263, + "step": 230 + }, + { + "epoch": 19.59, + "grad_norm": 0.1139909029006958, + "learning_rate": 0.00015999999999999999, + "loss": 0.0246, + "step": 240 + }, + { + "epoch": 19.59, + "eval_loss": 0.30519917607307434, + "eval_runtime": 89.8387, + "eval_samples_per_second": 4.308, + "eval_steps_per_second": 0.545, + "step": 240 + }, + { + "epoch": 20.41, + "grad_norm": 0.12317101657390594, + "learning_rate": 0.00015, + "loss": 0.0235, + "step": 250 + }, + { + "epoch": 21.22, + "grad_norm": 0.12494686245918274, + "learning_rate": 0.00014, + "loss": 0.0224, + "step": 260 + }, + { + "epoch": 21.22, + "eval_loss": 0.314134418964386, + "eval_runtime": 89.7974, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 260 + }, + { + "epoch": 22.04, + "grad_norm": 0.1180659756064415, + "learning_rate": 0.00013, + "loss": 0.022, + "step": 270 + }, + { + "epoch": 22.86, + "grad_norm": 0.09653373062610626, + "learning_rate": 0.00011999999999999999, + "loss": 0.0212, + "step": 280 + }, + { + "epoch": 22.86, + "eval_loss": 0.3175604045391083, + "eval_runtime": 89.9764, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 280 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 1.6064067756789596e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-280/training_args.bin b/checkpoint-280/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-280/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-300/README.md b/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-300/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-300/adapter_config.json b/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-300/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-300/adapter_model.safetensors b/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f3a17832fe7ab770b972fb49d3aa68c1e4d05f7 --- /dev/null +++ b/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f95d4f3b0790d8b9f70228f751a90a874bd1c329b0279d0d4f356aa76817469 +size 284628602 diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9b112642217f03203385840075fb1fa9af37a4e --- /dev/null +++ b/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56ac9836af59089a7eed0a7a9a522b10102e0f768a815b08c8934678c7d8f27a +size 14244 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..043065694100117ec9ec34d87d4119760c55d60e --- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a846f1b53d06b3a5385e00f3c382802f5449c10c5faa9bc984677d196316c2 +size 1064 diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7de1880ad5ba7edd4c41f963df0bc5ce6554339b --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,351 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 24.489795918367346, + "eval_steps": 20, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + }, + { + "epoch": 15.51, + "grad_norm": 0.1541435867547989, + "learning_rate": 0.00020999999999999998, + "loss": 0.0352, + "step": 190 + }, + { + "epoch": 16.33, + "grad_norm": 0.1553652435541153, + "learning_rate": 0.00019999999999999998, + "loss": 0.0325, + "step": 200 + }, + { + "epoch": 16.33, + "eval_loss": 0.28704825043678284, + "eval_runtime": 89.7951, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 200 + }, + { + "epoch": 17.14, + "grad_norm": 0.13403691351413727, + "learning_rate": 0.00018999999999999998, + "loss": 0.0297, + "step": 210 + }, + { + "epoch": 17.96, + "grad_norm": 0.14512716233730316, + "learning_rate": 0.00017999999999999998, + "loss": 0.0279, + "step": 220 + }, + { + "epoch": 17.96, + "eval_loss": 0.2964874505996704, + "eval_runtime": 89.7009, + "eval_samples_per_second": 4.314, + "eval_steps_per_second": 0.546, + "step": 220 + }, + { + "epoch": 18.78, + "grad_norm": 0.12400835007429123, + "learning_rate": 0.00016999999999999999, + "loss": 0.0263, + "step": 230 + }, + { + "epoch": 19.59, + "grad_norm": 0.1139909029006958, + "learning_rate": 0.00015999999999999999, + "loss": 0.0246, + "step": 240 + }, + { + "epoch": 19.59, + "eval_loss": 0.30519917607307434, + "eval_runtime": 89.8387, + "eval_samples_per_second": 4.308, + "eval_steps_per_second": 0.545, + "step": 240 + }, + { + "epoch": 20.41, + "grad_norm": 0.12317101657390594, + "learning_rate": 0.00015, + "loss": 0.0235, + "step": 250 + }, + { + "epoch": 21.22, + "grad_norm": 0.12494686245918274, + "learning_rate": 0.00014, + "loss": 0.0224, + "step": 260 + }, + { + "epoch": 21.22, + "eval_loss": 0.314134418964386, + "eval_runtime": 89.7974, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 260 + }, + { + "epoch": 22.04, + "grad_norm": 0.1180659756064415, + "learning_rate": 0.00013, + "loss": 0.022, + "step": 270 + }, + { + "epoch": 22.86, + "grad_norm": 0.09653373062610626, + "learning_rate": 0.00011999999999999999, + "loss": 0.0212, + "step": 280 + }, + { + "epoch": 22.86, + "eval_loss": 0.3175604045391083, + "eval_runtime": 89.9764, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 280 + }, + { + "epoch": 23.67, + "grad_norm": 0.10445748269557953, + "learning_rate": 0.00010999999999999998, + "loss": 0.0208, + "step": 290 + }, + { + "epoch": 24.49, + "grad_norm": 0.09245337545871735, + "learning_rate": 9.999999999999999e-05, + "loss": 0.0199, + "step": 300 + }, + { + "epoch": 24.49, + "eval_loss": 0.32360976934432983, + "eval_runtime": 89.8613, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 300 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 1.7204863592080343e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-320/README.md b/checkpoint-320/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-320/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-320/adapter_config.json b/checkpoint-320/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-320/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-320/adapter_model.safetensors b/checkpoint-320/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-320/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-320/optimizer.pt b/checkpoint-320/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b77f03251156a0c9a7532b815b09a0ebd39f12a9 --- /dev/null +++ b/checkpoint-320/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9069c1605cf1b3695fbdd5333f7113b9eaf2cb631fb39dd399b73f4e07a90668 +size 284628602 diff --git a/checkpoint-320/rng_state.pth b/checkpoint-320/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f309811c696f92771c94a54ed5c6590247a29180 --- /dev/null +++ b/checkpoint-320/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:402c1723d18bbfd6d61b2a49da9b43070c4fa33d72ea1a944508192e5a624a79 +size 14244 diff --git a/checkpoint-320/scheduler.pt b/checkpoint-320/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..44a7c96557d47c43ee760fe5e9084957e7d47bbb --- /dev/null +++ b/checkpoint-320/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c6f90122c7166d476e722a6703ab94e5de6361f1d1a61a844fb6fd64429c85b +size 1064 diff --git a/checkpoint-320/trainer_state.json b/checkpoint-320/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..354167742886740ba1fcf0005f897a1956639231 --- /dev/null +++ b/checkpoint-320/trainer_state.json @@ -0,0 +1,373 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 26.122448979591837, + "eval_steps": 20, + "global_step": 320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + }, + { + "epoch": 15.51, + "grad_norm": 0.1541435867547989, + "learning_rate": 0.00020999999999999998, + "loss": 0.0352, + "step": 190 + }, + { + "epoch": 16.33, + "grad_norm": 0.1553652435541153, + "learning_rate": 0.00019999999999999998, + "loss": 0.0325, + "step": 200 + }, + { + "epoch": 16.33, + "eval_loss": 0.28704825043678284, + "eval_runtime": 89.7951, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 200 + }, + { + "epoch": 17.14, + "grad_norm": 0.13403691351413727, + "learning_rate": 0.00018999999999999998, + "loss": 0.0297, + "step": 210 + }, + { + "epoch": 17.96, + "grad_norm": 0.14512716233730316, + "learning_rate": 0.00017999999999999998, + "loss": 0.0279, + "step": 220 + }, + { + "epoch": 17.96, + "eval_loss": 0.2964874505996704, + "eval_runtime": 89.7009, + "eval_samples_per_second": 4.314, + "eval_steps_per_second": 0.546, + "step": 220 + }, + { + "epoch": 18.78, + "grad_norm": 0.12400835007429123, + "learning_rate": 0.00016999999999999999, + "loss": 0.0263, + "step": 230 + }, + { + "epoch": 19.59, + "grad_norm": 0.1139909029006958, + "learning_rate": 0.00015999999999999999, + "loss": 0.0246, + "step": 240 + }, + { + "epoch": 19.59, + "eval_loss": 0.30519917607307434, + "eval_runtime": 89.8387, + "eval_samples_per_second": 4.308, + "eval_steps_per_second": 0.545, + "step": 240 + }, + { + "epoch": 20.41, + "grad_norm": 0.12317101657390594, + "learning_rate": 0.00015, + "loss": 0.0235, + "step": 250 + }, + { + "epoch": 21.22, + "grad_norm": 0.12494686245918274, + "learning_rate": 0.00014, + "loss": 0.0224, + "step": 260 + }, + { + "epoch": 21.22, + "eval_loss": 0.314134418964386, + "eval_runtime": 89.7974, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 260 + }, + { + "epoch": 22.04, + "grad_norm": 0.1180659756064415, + "learning_rate": 0.00013, + "loss": 0.022, + "step": 270 + }, + { + "epoch": 22.86, + "grad_norm": 0.09653373062610626, + "learning_rate": 0.00011999999999999999, + "loss": 0.0212, + "step": 280 + }, + { + "epoch": 22.86, + "eval_loss": 0.3175604045391083, + "eval_runtime": 89.9764, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 280 + }, + { + "epoch": 23.67, + "grad_norm": 0.10445748269557953, + "learning_rate": 0.00010999999999999998, + "loss": 0.0208, + "step": 290 + }, + { + "epoch": 24.49, + "grad_norm": 0.09245337545871735, + "learning_rate": 9.999999999999999e-05, + "loss": 0.0199, + "step": 300 + }, + { + "epoch": 24.49, + "eval_loss": 0.32360976934432983, + "eval_runtime": 89.8613, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 300 + }, + { + "epoch": 25.31, + "grad_norm": 0.09468758851289749, + "learning_rate": 8.999999999999999e-05, + "loss": 0.0197, + "step": 310 + }, + { + "epoch": 26.12, + "grad_norm": 0.0891977846622467, + "learning_rate": 7.999999999999999e-05, + "loss": 0.0192, + "step": 320 + }, + { + "epoch": 26.12, + "eval_loss": 0.3267403841018677, + "eval_runtime": 90.3063, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 320 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 1.8352663904460472e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-320/training_args.bin b/checkpoint-320/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-320/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-340/README.md b/checkpoint-340/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-340/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-340/adapter_config.json b/checkpoint-340/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-340/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-340/adapter_model.safetensors b/checkpoint-340/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-340/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-340/optimizer.pt b/checkpoint-340/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..574687a84444a8b3a7ad4d3aea7df81378dc3787 --- /dev/null +++ b/checkpoint-340/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aed1d3769cbffeae18166870a276b374735aff755c2d47e63d72d0cbd1bda556 +size 284628602 diff --git a/checkpoint-340/rng_state.pth b/checkpoint-340/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..85df9fcc725ccc7574a5a906b54189070c16c85e --- /dev/null +++ b/checkpoint-340/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5329c1b9ada42ddfb578fd23003ee82843e8f02a2d4dae483aeb12ea319fd09 +size 14244 diff --git a/checkpoint-340/scheduler.pt b/checkpoint-340/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce1f7af7d3957bf85581fa17216b0d13cafadfac --- /dev/null +++ b/checkpoint-340/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03376fa8bdbd1f35e302b4f6e9042b2c5f57834becd5c6d5f467088976873ddd +size 1064 diff --git a/checkpoint-340/trainer_state.json b/checkpoint-340/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..694a17fb355951404c8b1741f45bb9b36e0b0ec1 --- /dev/null +++ b/checkpoint-340/trainer_state.json @@ -0,0 +1,395 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 27.755102040816325, + "eval_steps": 20, + "global_step": 340, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + }, + { + "epoch": 15.51, + "grad_norm": 0.1541435867547989, + "learning_rate": 0.00020999999999999998, + "loss": 0.0352, + "step": 190 + }, + { + "epoch": 16.33, + "grad_norm": 0.1553652435541153, + "learning_rate": 0.00019999999999999998, + "loss": 0.0325, + "step": 200 + }, + { + "epoch": 16.33, + "eval_loss": 0.28704825043678284, + "eval_runtime": 89.7951, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 200 + }, + { + "epoch": 17.14, + "grad_norm": 0.13403691351413727, + "learning_rate": 0.00018999999999999998, + "loss": 0.0297, + "step": 210 + }, + { + "epoch": 17.96, + "grad_norm": 0.14512716233730316, + "learning_rate": 0.00017999999999999998, + "loss": 0.0279, + "step": 220 + }, + { + "epoch": 17.96, + "eval_loss": 0.2964874505996704, + "eval_runtime": 89.7009, + "eval_samples_per_second": 4.314, + "eval_steps_per_second": 0.546, + "step": 220 + }, + { + "epoch": 18.78, + "grad_norm": 0.12400835007429123, + "learning_rate": 0.00016999999999999999, + "loss": 0.0263, + "step": 230 + }, + { + "epoch": 19.59, + "grad_norm": 0.1139909029006958, + "learning_rate": 0.00015999999999999999, + "loss": 0.0246, + "step": 240 + }, + { + "epoch": 19.59, + "eval_loss": 0.30519917607307434, + "eval_runtime": 89.8387, + "eval_samples_per_second": 4.308, + "eval_steps_per_second": 0.545, + "step": 240 + }, + { + "epoch": 20.41, + "grad_norm": 0.12317101657390594, + "learning_rate": 0.00015, + "loss": 0.0235, + "step": 250 + }, + { + "epoch": 21.22, + "grad_norm": 0.12494686245918274, + "learning_rate": 0.00014, + "loss": 0.0224, + "step": 260 + }, + { + "epoch": 21.22, + "eval_loss": 0.314134418964386, + "eval_runtime": 89.7974, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 260 + }, + { + "epoch": 22.04, + "grad_norm": 0.1180659756064415, + "learning_rate": 0.00013, + "loss": 0.022, + "step": 270 + }, + { + "epoch": 22.86, + "grad_norm": 0.09653373062610626, + "learning_rate": 0.00011999999999999999, + "loss": 0.0212, + "step": 280 + }, + { + "epoch": 22.86, + "eval_loss": 0.3175604045391083, + "eval_runtime": 89.9764, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 280 + }, + { + "epoch": 23.67, + "grad_norm": 0.10445748269557953, + "learning_rate": 0.00010999999999999998, + "loss": 0.0208, + "step": 290 + }, + { + "epoch": 24.49, + "grad_norm": 0.09245337545871735, + "learning_rate": 9.999999999999999e-05, + "loss": 0.0199, + "step": 300 + }, + { + "epoch": 24.49, + "eval_loss": 0.32360976934432983, + "eval_runtime": 89.8613, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 300 + }, + { + "epoch": 25.31, + "grad_norm": 0.09468758851289749, + "learning_rate": 8.999999999999999e-05, + "loss": 0.0197, + "step": 310 + }, + { + "epoch": 26.12, + "grad_norm": 0.0891977846622467, + "learning_rate": 7.999999999999999e-05, + "loss": 0.0192, + "step": 320 + }, + { + "epoch": 26.12, + "eval_loss": 0.3267403841018677, + "eval_runtime": 90.3063, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 320 + }, + { + "epoch": 26.94, + "grad_norm": 0.08574336767196655, + "learning_rate": 7e-05, + "loss": 0.0188, + "step": 330 + }, + { + "epoch": 27.76, + "grad_norm": 0.08517367392778397, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.0184, + "step": 340 + }, + { + "epoch": 27.76, + "eval_loss": 0.33063870668411255, + "eval_runtime": 89.8041, + "eval_samples_per_second": 4.309, + "eval_steps_per_second": 0.546, + "step": 340 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 1.9503266007676355e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-340/training_args.bin b/checkpoint-340/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-340/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-360/README.md b/checkpoint-360/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-360/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-360/adapter_config.json b/checkpoint-360/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-360/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-360/adapter_model.safetensors b/checkpoint-360/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-360/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-360/optimizer.pt b/checkpoint-360/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1a2caa0fe16ebd0a2aab6d4d1868e2312fc8c2a --- /dev/null +++ b/checkpoint-360/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:141865bc10df9b78a6f453093595dc7b4a83d54018b3fe83379cee566526a843 +size 284628602 diff --git a/checkpoint-360/rng_state.pth b/checkpoint-360/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7550b719f16999791822e5898428f5c2d1117d88 --- /dev/null +++ b/checkpoint-360/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e908dccb85d3a690d3e5541946de9a9e4d091ab14c7167380ea12530d079df +size 14244 diff --git a/checkpoint-360/scheduler.pt b/checkpoint-360/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f81351e650ea7c1de881065d6d8cb01c4bf1781 --- /dev/null +++ b/checkpoint-360/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35452c1466d786926545eee1f0187be30d79bc322f32d2395b7460c9b050bbbe +size 1064 diff --git a/checkpoint-360/trainer_state.json b/checkpoint-360/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9d01019879c208198e948c370fcc54315150a876 --- /dev/null +++ b/checkpoint-360/trainer_state.json @@ -0,0 +1,417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 29.387755102040817, + "eval_steps": 20, + "global_step": 360, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + }, + { + "epoch": 15.51, + "grad_norm": 0.1541435867547989, + "learning_rate": 0.00020999999999999998, + "loss": 0.0352, + "step": 190 + }, + { + "epoch": 16.33, + "grad_norm": 0.1553652435541153, + "learning_rate": 0.00019999999999999998, + "loss": 0.0325, + "step": 200 + }, + { + "epoch": 16.33, + "eval_loss": 0.28704825043678284, + "eval_runtime": 89.7951, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 200 + }, + { + "epoch": 17.14, + "grad_norm": 0.13403691351413727, + "learning_rate": 0.00018999999999999998, + "loss": 0.0297, + "step": 210 + }, + { + "epoch": 17.96, + "grad_norm": 0.14512716233730316, + "learning_rate": 0.00017999999999999998, + "loss": 0.0279, + "step": 220 + }, + { + "epoch": 17.96, + "eval_loss": 0.2964874505996704, + "eval_runtime": 89.7009, + "eval_samples_per_second": 4.314, + "eval_steps_per_second": 0.546, + "step": 220 + }, + { + "epoch": 18.78, + "grad_norm": 0.12400835007429123, + "learning_rate": 0.00016999999999999999, + "loss": 0.0263, + "step": 230 + }, + { + "epoch": 19.59, + "grad_norm": 0.1139909029006958, + "learning_rate": 0.00015999999999999999, + "loss": 0.0246, + "step": 240 + }, + { + "epoch": 19.59, + "eval_loss": 0.30519917607307434, + "eval_runtime": 89.8387, + "eval_samples_per_second": 4.308, + "eval_steps_per_second": 0.545, + "step": 240 + }, + { + "epoch": 20.41, + "grad_norm": 0.12317101657390594, + "learning_rate": 0.00015, + "loss": 0.0235, + "step": 250 + }, + { + "epoch": 21.22, + "grad_norm": 0.12494686245918274, + "learning_rate": 0.00014, + "loss": 0.0224, + "step": 260 + }, + { + "epoch": 21.22, + "eval_loss": 0.314134418964386, + "eval_runtime": 89.7974, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 260 + }, + { + "epoch": 22.04, + "grad_norm": 0.1180659756064415, + "learning_rate": 0.00013, + "loss": 0.022, + "step": 270 + }, + { + "epoch": 22.86, + "grad_norm": 0.09653373062610626, + "learning_rate": 0.00011999999999999999, + "loss": 0.0212, + "step": 280 + }, + { + "epoch": 22.86, + "eval_loss": 0.3175604045391083, + "eval_runtime": 89.9764, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 280 + }, + { + "epoch": 23.67, + "grad_norm": 0.10445748269557953, + "learning_rate": 0.00010999999999999998, + "loss": 0.0208, + "step": 290 + }, + { + "epoch": 24.49, + "grad_norm": 0.09245337545871735, + "learning_rate": 9.999999999999999e-05, + "loss": 0.0199, + "step": 300 + }, + { + "epoch": 24.49, + "eval_loss": 0.32360976934432983, + "eval_runtime": 89.8613, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 300 + }, + { + "epoch": 25.31, + "grad_norm": 0.09468758851289749, + "learning_rate": 8.999999999999999e-05, + "loss": 0.0197, + "step": 310 + }, + { + "epoch": 26.12, + "grad_norm": 0.0891977846622467, + "learning_rate": 7.999999999999999e-05, + "loss": 0.0192, + "step": 320 + }, + { + "epoch": 26.12, + "eval_loss": 0.3267403841018677, + "eval_runtime": 90.3063, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 320 + }, + { + "epoch": 26.94, + "grad_norm": 0.08574336767196655, + "learning_rate": 7e-05, + "loss": 0.0188, + "step": 330 + }, + { + "epoch": 27.76, + "grad_norm": 0.08517367392778397, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.0184, + "step": 340 + }, + { + "epoch": 27.76, + "eval_loss": 0.33063870668411255, + "eval_runtime": 89.8041, + "eval_samples_per_second": 4.309, + "eval_steps_per_second": 0.546, + "step": 340 + }, + { + "epoch": 28.57, + "grad_norm": 0.08357132971286774, + "learning_rate": 4.9999999999999996e-05, + "loss": 0.0181, + "step": 350 + }, + { + "epoch": 29.39, + "grad_norm": 0.08679915964603424, + "learning_rate": 3.9999999999999996e-05, + "loss": 0.0181, + "step": 360 + }, + { + "epoch": 29.39, + "eval_loss": 0.3352932929992676, + "eval_runtime": 90.2438, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 360 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 2.0648731494360023e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-360/training_args.bin b/checkpoint-360/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-360/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-380/README.md b/checkpoint-380/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-380/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-380/adapter_config.json b/checkpoint-380/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-380/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-380/adapter_model.safetensors b/checkpoint-380/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-380/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-380/optimizer.pt b/checkpoint-380/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e7acb641af9b3bd1b1d0e81ac89d6ddbef978d0 --- /dev/null +++ b/checkpoint-380/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c2a34ca054a8a94f5560be130b0c2008ecadaf3fb3e97fa51ad5cc1446db376 +size 284628602 diff --git a/checkpoint-380/rng_state.pth b/checkpoint-380/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a675a736c9210266b321d054f7bb24abe150068 --- /dev/null +++ b/checkpoint-380/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c53763ba648d8eaed325a2de2f963eabbbd42e2b977ef0fc8e3b8b949cff0a5d +size 14244 diff --git a/checkpoint-380/scheduler.pt b/checkpoint-380/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0e2655e310c92954c1a1e203e6229349da342e9 --- /dev/null +++ b/checkpoint-380/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a39e2df26b3f3572b475720558ff8a2e0c169132c8cb9a6d4c54b36340352b7 +size 1064 diff --git a/checkpoint-380/trainer_state.json b/checkpoint-380/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2391dbfe3ce6f9878305bf3471f1ee505f7dd37e --- /dev/null +++ b/checkpoint-380/trainer_state.json @@ -0,0 +1,439 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 31.020408163265305, + "eval_steps": 20, + "global_step": 380, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + }, + { + "epoch": 15.51, + "grad_norm": 0.1541435867547989, + "learning_rate": 0.00020999999999999998, + "loss": 0.0352, + "step": 190 + }, + { + "epoch": 16.33, + "grad_norm": 0.1553652435541153, + "learning_rate": 0.00019999999999999998, + "loss": 0.0325, + "step": 200 + }, + { + "epoch": 16.33, + "eval_loss": 0.28704825043678284, + "eval_runtime": 89.7951, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 200 + }, + { + "epoch": 17.14, + "grad_norm": 0.13403691351413727, + "learning_rate": 0.00018999999999999998, + "loss": 0.0297, + "step": 210 + }, + { + "epoch": 17.96, + "grad_norm": 0.14512716233730316, + "learning_rate": 0.00017999999999999998, + "loss": 0.0279, + "step": 220 + }, + { + "epoch": 17.96, + "eval_loss": 0.2964874505996704, + "eval_runtime": 89.7009, + "eval_samples_per_second": 4.314, + "eval_steps_per_second": 0.546, + "step": 220 + }, + { + "epoch": 18.78, + "grad_norm": 0.12400835007429123, + "learning_rate": 0.00016999999999999999, + "loss": 0.0263, + "step": 230 + }, + { + "epoch": 19.59, + "grad_norm": 0.1139909029006958, + "learning_rate": 0.00015999999999999999, + "loss": 0.0246, + "step": 240 + }, + { + "epoch": 19.59, + "eval_loss": 0.30519917607307434, + "eval_runtime": 89.8387, + "eval_samples_per_second": 4.308, + "eval_steps_per_second": 0.545, + "step": 240 + }, + { + "epoch": 20.41, + "grad_norm": 0.12317101657390594, + "learning_rate": 0.00015, + "loss": 0.0235, + "step": 250 + }, + { + "epoch": 21.22, + "grad_norm": 0.12494686245918274, + "learning_rate": 0.00014, + "loss": 0.0224, + "step": 260 + }, + { + "epoch": 21.22, + "eval_loss": 0.314134418964386, + "eval_runtime": 89.7974, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 260 + }, + { + "epoch": 22.04, + "grad_norm": 0.1180659756064415, + "learning_rate": 0.00013, + "loss": 0.022, + "step": 270 + }, + { + "epoch": 22.86, + "grad_norm": 0.09653373062610626, + "learning_rate": 0.00011999999999999999, + "loss": 0.0212, + "step": 280 + }, + { + "epoch": 22.86, + "eval_loss": 0.3175604045391083, + "eval_runtime": 89.9764, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 280 + }, + { + "epoch": 23.67, + "grad_norm": 0.10445748269557953, + "learning_rate": 0.00010999999999999998, + "loss": 0.0208, + "step": 290 + }, + { + "epoch": 24.49, + "grad_norm": 0.09245337545871735, + "learning_rate": 9.999999999999999e-05, + "loss": 0.0199, + "step": 300 + }, + { + "epoch": 24.49, + "eval_loss": 0.32360976934432983, + "eval_runtime": 89.8613, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 300 + }, + { + "epoch": 25.31, + "grad_norm": 0.09468758851289749, + "learning_rate": 8.999999999999999e-05, + "loss": 0.0197, + "step": 310 + }, + { + "epoch": 26.12, + "grad_norm": 0.0891977846622467, + "learning_rate": 7.999999999999999e-05, + "loss": 0.0192, + "step": 320 + }, + { + "epoch": 26.12, + "eval_loss": 0.3267403841018677, + "eval_runtime": 90.3063, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 320 + }, + { + "epoch": 26.94, + "grad_norm": 0.08574336767196655, + "learning_rate": 7e-05, + "loss": 0.0188, + "step": 330 + }, + { + "epoch": 27.76, + "grad_norm": 0.08517367392778397, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.0184, + "step": 340 + }, + { + "epoch": 27.76, + "eval_loss": 0.33063870668411255, + "eval_runtime": 89.8041, + "eval_samples_per_second": 4.309, + "eval_steps_per_second": 0.546, + "step": 340 + }, + { + "epoch": 28.57, + "grad_norm": 0.08357132971286774, + "learning_rate": 4.9999999999999996e-05, + "loss": 0.0181, + "step": 350 + }, + { + "epoch": 29.39, + "grad_norm": 0.08679915964603424, + "learning_rate": 3.9999999999999996e-05, + "loss": 0.0181, + "step": 360 + }, + { + "epoch": 29.39, + "eval_loss": 0.3352932929992676, + "eval_runtime": 90.2438, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 360 + }, + { + "epoch": 30.2, + "grad_norm": 0.07208231836557388, + "learning_rate": 2.9999999999999997e-05, + "loss": 0.0178, + "step": 370 + }, + { + "epoch": 31.02, + "grad_norm": 0.08611435443162918, + "learning_rate": 1.9999999999999998e-05, + "loss": 0.0175, + "step": 380 + }, + { + "epoch": 31.02, + "eval_loss": 0.338119238615036, + "eval_runtime": 90.2767, + "eval_samples_per_second": 4.287, + "eval_steps_per_second": 0.543, + "step": 380 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 2.1789060364511478e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-380/training_args.bin b/checkpoint-380/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-380/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-40/README.md b/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-40/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-40/adapter_config.json b/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-40/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-40/adapter_model.safetensors b/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-40/optimizer.pt b/checkpoint-40/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e164c05d60c992a05ac41272e225d4fc0f5b3138 --- /dev/null +++ b/checkpoint-40/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8725ae58cb6274766242de451fee2b4e9735b5babfc1f0a0978d1e4d1e0d13b7 +size 284628602 diff --git a/checkpoint-40/rng_state.pth b/checkpoint-40/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d68344df651a0ac3b45d1a20a36acb7f27f114b --- /dev/null +++ b/checkpoint-40/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b8cf7272d40bde7127d6862dd27f666bac39b576335a5119bee0adfd33cad2 +size 14244 diff --git a/checkpoint-40/scheduler.pt b/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d1c414264d7a61f0eab2a577c1b2d29d2015e3c --- /dev/null +++ b/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2c8a6a21ca956c9d84f17aca6da97a5c2b2ba1ba5d1e4f431e18bc47f94ca32 +size 1064 diff --git a/checkpoint-40/trainer_state.json b/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad06a8838adbdd063f90438741c1c88b7a11b6f0 --- /dev/null +++ b/checkpoint-40/trainer_state.json @@ -0,0 +1,65 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.2653061224489797, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 2.2956006247602586e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-40/training_args.bin b/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-400/README.md b/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-400/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-400/adapter_config.json b/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-400/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-400/adapter_model.safetensors b/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f063a64b8788731f471ef26ea29c4fb588d6a6a1 --- /dev/null +++ b/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6135358af0dd648b51a5faf17d8f31c2e64885a4ac2282a2142e25937bbfc3b8 +size 284628602 diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..abe706932ca6879f53044735e0ae8d2a3e2ad561 --- /dev/null +++ b/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4fa90b42117e78621355d0b2d6ec35c92d5ce1d8a8e3fd4c6a78791b096ffd2 +size 14244 diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..62c7d6ea1655a58e9a5460c3034b46e354a34af4 --- /dev/null +++ b/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee665d99b8d4ac37b6829a57abd01a01763b04846f27bc645d525d70173d6821 +size 1064 diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..68537d37a7e8de364310b2b0e76f5f4e4d569a26 --- /dev/null +++ b/checkpoint-400/trainer_state.json @@ -0,0 +1,461 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 32.6530612244898, + "eval_steps": 20, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + }, + { + "epoch": 7.35, + "grad_norm": 0.17941106855869293, + "learning_rate": 0.00027, + "loss": 0.2216, + "step": 90 + }, + { + "epoch": 8.16, + "grad_norm": 0.20095375180244446, + "learning_rate": 0.0003, + "loss": 0.1832, + "step": 100 + }, + { + "epoch": 8.16, + "eval_loss": 0.2350914180278778, + "eval_runtime": 90.3919, + "eval_samples_per_second": 4.281, + "eval_steps_per_second": 0.542, + "step": 100 + }, + { + "epoch": 8.98, + "grad_norm": 0.2600422501564026, + "learning_rate": 0.00029, + "loss": 0.1441, + "step": 110 + }, + { + "epoch": 9.8, + "grad_norm": 0.20544037222862244, + "learning_rate": 0.00028, + "loss": 0.1186, + "step": 120 + }, + { + "epoch": 9.8, + "eval_loss": 0.23090216517448425, + "eval_runtime": 90.3144, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 120 + }, + { + "epoch": 10.61, + "grad_norm": 0.2158157229423523, + "learning_rate": 0.00027, + "loss": 0.0947, + "step": 130 + }, + { + "epoch": 11.43, + "grad_norm": 0.18916285037994385, + "learning_rate": 0.00026, + "loss": 0.0768, + "step": 140 + }, + { + "epoch": 11.43, + "eval_loss": 0.24214179813861847, + "eval_runtime": 90.2597, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 140 + }, + { + "epoch": 12.24, + "grad_norm": 0.22263498604297638, + "learning_rate": 0.00025, + "loss": 0.0615, + "step": 150 + }, + { + "epoch": 13.06, + "grad_norm": 0.21315976977348328, + "learning_rate": 0.00023999999999999998, + "loss": 0.054, + "step": 160 + }, + { + "epoch": 13.06, + "eval_loss": 0.25932466983795166, + "eval_runtime": 89.8439, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 160 + }, + { + "epoch": 13.88, + "grad_norm": 0.18338361382484436, + "learning_rate": 0.00023, + "loss": 0.0455, + "step": 170 + }, + { + "epoch": 14.69, + "grad_norm": 0.17157459259033203, + "learning_rate": 0.00021999999999999995, + "loss": 0.0393, + "step": 180 + }, + { + "epoch": 14.69, + "eval_loss": 0.27233538031578064, + "eval_runtime": 90.1364, + "eval_samples_per_second": 4.293, + "eval_steps_per_second": 0.544, + "step": 180 + }, + { + "epoch": 15.51, + "grad_norm": 0.1541435867547989, + "learning_rate": 0.00020999999999999998, + "loss": 0.0352, + "step": 190 + }, + { + "epoch": 16.33, + "grad_norm": 0.1553652435541153, + "learning_rate": 0.00019999999999999998, + "loss": 0.0325, + "step": 200 + }, + { + "epoch": 16.33, + "eval_loss": 0.28704825043678284, + "eval_runtime": 89.7951, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 200 + }, + { + "epoch": 17.14, + "grad_norm": 0.13403691351413727, + "learning_rate": 0.00018999999999999998, + "loss": 0.0297, + "step": 210 + }, + { + "epoch": 17.96, + "grad_norm": 0.14512716233730316, + "learning_rate": 0.00017999999999999998, + "loss": 0.0279, + "step": 220 + }, + { + "epoch": 17.96, + "eval_loss": 0.2964874505996704, + "eval_runtime": 89.7009, + "eval_samples_per_second": 4.314, + "eval_steps_per_second": 0.546, + "step": 220 + }, + { + "epoch": 18.78, + "grad_norm": 0.12400835007429123, + "learning_rate": 0.00016999999999999999, + "loss": 0.0263, + "step": 230 + }, + { + "epoch": 19.59, + "grad_norm": 0.1139909029006958, + "learning_rate": 0.00015999999999999999, + "loss": 0.0246, + "step": 240 + }, + { + "epoch": 19.59, + "eval_loss": 0.30519917607307434, + "eval_runtime": 89.8387, + "eval_samples_per_second": 4.308, + "eval_steps_per_second": 0.545, + "step": 240 + }, + { + "epoch": 20.41, + "grad_norm": 0.12317101657390594, + "learning_rate": 0.00015, + "loss": 0.0235, + "step": 250 + }, + { + "epoch": 21.22, + "grad_norm": 0.12494686245918274, + "learning_rate": 0.00014, + "loss": 0.0224, + "step": 260 + }, + { + "epoch": 21.22, + "eval_loss": 0.314134418964386, + "eval_runtime": 89.7974, + "eval_samples_per_second": 4.31, + "eval_steps_per_second": 0.546, + "step": 260 + }, + { + "epoch": 22.04, + "grad_norm": 0.1180659756064415, + "learning_rate": 0.00013, + "loss": 0.022, + "step": 270 + }, + { + "epoch": 22.86, + "grad_norm": 0.09653373062610626, + "learning_rate": 0.00011999999999999999, + "loss": 0.0212, + "step": 280 + }, + { + "epoch": 22.86, + "eval_loss": 0.3175604045391083, + "eval_runtime": 89.9764, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 280 + }, + { + "epoch": 23.67, + "grad_norm": 0.10445748269557953, + "learning_rate": 0.00010999999999999998, + "loss": 0.0208, + "step": 290 + }, + { + "epoch": 24.49, + "grad_norm": 0.09245337545871735, + "learning_rate": 9.999999999999999e-05, + "loss": 0.0199, + "step": 300 + }, + { + "epoch": 24.49, + "eval_loss": 0.32360976934432983, + "eval_runtime": 89.8613, + "eval_samples_per_second": 4.307, + "eval_steps_per_second": 0.545, + "step": 300 + }, + { + "epoch": 25.31, + "grad_norm": 0.09468758851289749, + "learning_rate": 8.999999999999999e-05, + "loss": 0.0197, + "step": 310 + }, + { + "epoch": 26.12, + "grad_norm": 0.0891977846622467, + "learning_rate": 7.999999999999999e-05, + "loss": 0.0192, + "step": 320 + }, + { + "epoch": 26.12, + "eval_loss": 0.3267403841018677, + "eval_runtime": 90.3063, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 320 + }, + { + "epoch": 26.94, + "grad_norm": 0.08574336767196655, + "learning_rate": 7e-05, + "loss": 0.0188, + "step": 330 + }, + { + "epoch": 27.76, + "grad_norm": 0.08517367392778397, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.0184, + "step": 340 + }, + { + "epoch": 27.76, + "eval_loss": 0.33063870668411255, + "eval_runtime": 89.8041, + "eval_samples_per_second": 4.309, + "eval_steps_per_second": 0.546, + "step": 340 + }, + { + "epoch": 28.57, + "grad_norm": 0.08357132971286774, + "learning_rate": 4.9999999999999996e-05, + "loss": 0.0181, + "step": 350 + }, + { + "epoch": 29.39, + "grad_norm": 0.08679915964603424, + "learning_rate": 3.9999999999999996e-05, + "loss": 0.0181, + "step": 360 + }, + { + "epoch": 29.39, + "eval_loss": 0.3352932929992676, + "eval_runtime": 90.2438, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.543, + "step": 360 + }, + { + "epoch": 30.2, + "grad_norm": 0.07208231836557388, + "learning_rate": 2.9999999999999997e-05, + "loss": 0.0178, + "step": 370 + }, + { + "epoch": 31.02, + "grad_norm": 0.08611435443162918, + "learning_rate": 1.9999999999999998e-05, + "loss": 0.0175, + "step": 380 + }, + { + "epoch": 31.02, + "eval_loss": 0.338119238615036, + "eval_runtime": 90.2767, + "eval_samples_per_second": 4.287, + "eval_steps_per_second": 0.543, + "step": 380 + }, + { + "epoch": 31.84, + "grad_norm": 0.07755295187234879, + "learning_rate": 9.999999999999999e-06, + "loss": 0.0176, + "step": 390 + }, + { + "epoch": 32.65, + "grad_norm": 0.07367673516273499, + "learning_rate": 0.0, + "loss": 0.0173, + "step": 400 + }, + { + "epoch": 32.65, + "eval_loss": 0.3382853865623474, + "eval_runtime": 90.0437, + "eval_samples_per_second": 4.298, + "eval_steps_per_second": 0.544, + "step": 400 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 2.2944565601689928e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-60/README.md b/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-60/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-60/adapter_config.json b/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-60/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-60/adapter_model.safetensors b/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-60/optimizer.pt b/checkpoint-60/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9737729f3fd08f9033816836e4170e40afdf350b --- /dev/null +++ b/checkpoint-60/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b95f1af5b76ccbe64193f2025017a1bcabe6da07c7696008de971d17369c5c0 +size 284628602 diff --git a/checkpoint-60/rng_state.pth b/checkpoint-60/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea19684ee32c082089d6be5faa1c380f5ab5e08b --- /dev/null +++ b/checkpoint-60/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae8652c2879fe670bb11f4dc2977badc0e05f30468a3d779c66a37c466dda75 +size 14244 diff --git a/checkpoint-60/scheduler.pt b/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccc7f4b428abbeedfc6798ac0a82ddf117ba2dfc --- /dev/null +++ b/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26d0406405dc2799f6a205a30a40ceac73b9e2fdb57b3e7109b27235b06006ef +size 1064 diff --git a/checkpoint-60/trainer_state.json b/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7590283f71046258b0022bb96dc9c4b1f3518b57 --- /dev/null +++ b/checkpoint-60/trainer_state.json @@ -0,0 +1,87 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.8979591836734695, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 3.449704966520832e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-60/training_args.bin b/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/checkpoint-80/README.md b/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b6f787248182d62a16f6423f948c336352c3674 --- /dev/null +++ b/checkpoint-80/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: bigcode/starcoder +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.8.2 \ No newline at end of file diff --git a/checkpoint-80/adapter_config.json b/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6c617ff1322585052700834abbc593bae7c619 --- /dev/null +++ b/checkpoint-80/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "bigcode/starcoder", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "c_proj", + "c_attn", + "q_attn" + ], + "task_type": "CAUSAL_LM", + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-80/adapter_model.safetensors b/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b7377ced059703efbf7179779a269234116eb75 --- /dev/null +++ b/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ce263e6fd885f50d82ca515b9325375b43ee36ededb75acf161ce88bc2e41 +size 48 diff --git a/checkpoint-80/optimizer.pt b/checkpoint-80/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..22b62039fd85cbe0ec0054d4e9988ceef8c28cbd --- /dev/null +++ b/checkpoint-80/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c298e33138163e2a15ee5a6a6cc8f3e68c596077e4d00579ac36d42a3a18228 +size 284628602 diff --git a/checkpoint-80/rng_state.pth b/checkpoint-80/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b71d91cd22e12bb910fa89a1496b2638c19590a4 --- /dev/null +++ b/checkpoint-80/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d18a6bfea687c87d90c470af6ca33dff658dae861a2666cb386ea47c46f0bb3 +size 14244 diff --git a/checkpoint-80/scheduler.pt b/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee345e8a659f9d4e86e79b79c6e415ea95b6fa42 --- /dev/null +++ b/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21d82a70de51a4166c824e4076d761aaf8a8967df5c1cd7fdce99da5c3b5bc50 +size 1064 diff --git a/checkpoint-80/trainer_state.json b/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7365bbb261e1c2e97b1a4d9e283d6b8471e0d2d4 --- /dev/null +++ b/checkpoint-80/trainer_state.json @@ -0,0 +1,109 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.530612244897959, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.82, + "grad_norm": 0.18377472460269928, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.861, + "step": 10 + }, + { + "epoch": 1.63, + "grad_norm": 0.35202744603157043, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.7263, + "step": 20 + }, + { + "epoch": 1.63, + "eval_loss": 1.4457755088806152, + "eval_runtime": 89.8938, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.545, + "step": 20 + }, + { + "epoch": 2.45, + "grad_norm": 0.928983747959137, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1718, + "step": 30 + }, + { + "epoch": 3.27, + "grad_norm": 0.253262996673584, + "learning_rate": 0.00011999999999999999, + "loss": 0.4789, + "step": 40 + }, + { + "epoch": 3.27, + "eval_loss": 0.3332095146179199, + "eval_runtime": 89.9804, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.545, + "step": 40 + }, + { + "epoch": 4.08, + "grad_norm": 0.12236642092466354, + "learning_rate": 0.00015, + "loss": 0.3568, + "step": 50 + }, + { + "epoch": 4.9, + "grad_norm": 0.09160923212766647, + "learning_rate": 0.00017999999999999998, + "loss": 0.3256, + "step": 60 + }, + { + "epoch": 4.9, + "eval_loss": 0.2753114104270935, + "eval_runtime": 90.3206, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.543, + "step": 60 + }, + { + "epoch": 5.71, + "grad_norm": 0.10242326557636261, + "learning_rate": 0.00020999999999999998, + "loss": 0.2841, + "step": 70 + }, + { + "epoch": 6.53, + "grad_norm": 0.1305350810289383, + "learning_rate": 0.00023999999999999998, + "loss": 0.2615, + "step": 80 + }, + { + "epoch": 6.53, + "eval_loss": 0.2476309835910797, + "eval_runtime": 90.525, + "eval_samples_per_second": 4.275, + "eval_steps_per_second": 0.541, + "step": 80 + } + ], + "logging_steps": 10, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 20, + "total_flos": 4.593302592647332e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-80/training_args.bin b/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3f0d01d8ba12c6a725736cdf727e72ec03ea9a4f --- /dev/null +++ b/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01be3c1366faeea704d7b18d02c117abdc170d0c96565a08a0f3ad9c5e7a123a +size 4856 diff --git a/runs/Feb21_13-08-44_nq0jxhxas9/events.out.tfevents.1708520927.nq0jxhxas9.2015.0 b/runs/Feb21_13-08-44_nq0jxhxas9/events.out.tfevents.1708520927.nq0jxhxas9.2015.0 new file mode 100644 index 0000000000000000000000000000000000000000..b5c22135d4f498ee366043a888fe2238605434bb --- /dev/null +++ b/runs/Feb21_13-08-44_nq0jxhxas9/events.out.tfevents.1708520927.nq0jxhxas9.2015.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69773f591aabaa18531f775feb5941bdb18c98b05137a37f4a45da2770ec1850 +size 19459