jeongyoonhuh committed on Mar 28

Commit

c2ac464

verified ·

1 Parent(s): 897bc27

Upload folder using huggingface_hub

Browse files

Files changed (27) hide show

.gitattributes +2 -0
adapter_config.json +4 -4
adapter_model.safetensors +1 -1
checkpoint-126/README.md +202 -0
checkpoint-126/adapter_config.json +39 -0
checkpoint-126/adapter_model.safetensors +3 -0
checkpoint-126/optimizer.pt +3 -0
checkpoint-126/rng_state.pth +3 -0
checkpoint-126/scheduler.pt +3 -0
checkpoint-126/special_tokens_map.json +27 -0
checkpoint-126/tokenizer.json +3 -0
checkpoint-126/tokenizer_config.json +0 -0
checkpoint-126/trainer_state.json +214 -0
checkpoint-126/training_args.bin +3 -0
checkpoint-186/README.md +202 -0
checkpoint-186/adapter_config.json +39 -0
checkpoint-186/adapter_model.safetensors +3 -0
checkpoint-186/config.json +60 -0
checkpoint-186/optimizer.pt +3 -0
checkpoint-186/rng_state.pth +3 -0
checkpoint-186/scheduler.pt +3 -0
checkpoint-186/special_tokens_map.json +27 -0
checkpoint-186/tokenizer.json +3 -0
checkpoint-186/tokenizer_config.json +0 -0
checkpoint-186/trainer_state.json +304 -0
checkpoint-186/training_args.bin +3 -0
training_args.bin +1 -1

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-126/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-186/tokenizer.json filter=lfs diff=lfs merge=lfs -text

adapter_config.json CHANGED Viewed

@@ -24,13 +24,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "up_proj",
     "down_proj",
-    "q_proj",
-    "o_proj",
-    "gate_proj",
     "k_proj",
-    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "q_proj",
     "up_proj",
     "down_proj",
     "k_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:085323d1e3e5eb82e0d58f330b33e233fab73d6bea2e32453ec8245b130510d2
 size 131251312

 version https://git-lfs.github.com/spec/v1
+oid sha256:bfeb68b0a4f2f293efe175c3ce95680d0d7d10a4af7ed523f239b1c4997c4236
 size 131251312

checkpoint-126/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: google/gemma-3-4b-it
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed to make further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.15.1

checkpoint-126/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-3-4b-it",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "up_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-126/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca62a8958ce2119832c0687fe30c7d1a411501020a9dd5846adeacaa388d9f9e
+size 131251312

checkpoint-126/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ceff60913fc2b1842c4ba2e946975a6beb8ba9f3e46df576ebc9af131cc33b61
+size 238821222

checkpoint-126/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f6861a77d18f1a753f29d0bc3870d6bc9dd58e91fec830031a33ba86e5b96617
+size 14244

checkpoint-126/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ef01a1358df6bcc6543a0f4a68520d86f4c70e98e4cb6db182b943401264759
+size 1064

checkpoint-126/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": "<eos>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-126/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

checkpoint-126/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-126/trainer_state.json ADDED Viewed

	@@ -0,0 +1,214 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 126,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.1606425702811245,
+      "grad_norm": 8.837871551513672,
+      "learning_rate": 0.000189247311827957,
+      "loss": 8.4808,
+      "step": 10
+    },
+    {
+      "epoch": 0.1606425702811245,
+      "eval_loss": 0.47449949383735657,
+      "eval_runtime": 44.2521,
+      "eval_samples_per_second": 11.276,
+      "eval_steps_per_second": 2.825,
+      "step": 10
+    },
+    {
+      "epoch": 0.321285140562249,
+      "grad_norm": 3.9745054244995117,
+      "learning_rate": 0.00017849462365591398,
+      "loss": 3.0872,
+      "step": 20
+    },
+    {
+      "epoch": 0.321285140562249,
+      "eval_loss": 0.3382565677165985,
+      "eval_runtime": 48.3781,
+      "eval_samples_per_second": 10.315,
+      "eval_steps_per_second": 2.584,
+      "step": 20
+    },
+    {
+      "epoch": 0.4819277108433735,
+      "grad_norm": 3.2844247817993164,
+      "learning_rate": 0.00016774193548387098,
+      "loss": 2.7654,
+      "step": 30
+    },
+    {
+      "epoch": 0.4819277108433735,
+      "eval_loss": 0.3172107934951782,
+      "eval_runtime": 50.1957,
+      "eval_samples_per_second": 9.941,
+      "eval_steps_per_second": 2.49,
+      "step": 30
+    },
+    {
+      "epoch": 0.642570281124498,
+      "grad_norm": 2.986614465713501,
+      "learning_rate": 0.00015698924731182796,
+      "loss": 2.4167,
+      "step": 40
+    },
+    {
+      "epoch": 0.642570281124498,
+      "eval_loss": 0.3065057694911957,
+      "eval_runtime": 49.9091,
+      "eval_samples_per_second": 9.998,
+      "eval_steps_per_second": 2.505,
+      "step": 40
+    },
+    {
+      "epoch": 0.8032128514056225,
+      "grad_norm": 2.47356915473938,
+      "learning_rate": 0.00014623655913978496,
+      "loss": 2.5307,
+      "step": 50
+    },
+    {
+      "epoch": 0.8032128514056225,
+      "eval_loss": 0.296438992023468,
+      "eval_runtime": 45.7506,
+      "eval_samples_per_second": 10.907,
+      "eval_steps_per_second": 2.732,
+      "step": 50
+    },
+    {
+      "epoch": 0.963855421686747,
+      "grad_norm": 2.9974865913391113,
+      "learning_rate": 0.00013548387096774193,
+      "loss": 2.48,
+      "step": 60
+    },
+    {
+      "epoch": 0.963855421686747,
+      "eval_loss": 0.2858414649963379,
+      "eval_runtime": 47.1672,
+      "eval_samples_per_second": 10.579,
+      "eval_steps_per_second": 2.65,
+      "step": 60
+    },
+    {
+      "epoch": 1.1124497991967872,
+      "grad_norm": 2.8883159160614014,
+      "learning_rate": 0.00012473118279569893,
+      "loss": 1.9267,
+      "step": 70
+    },
+    {
+      "epoch": 1.1124497991967872,
+      "eval_loss": 0.28903448581695557,
+      "eval_runtime": 45.0342,
+      "eval_samples_per_second": 11.08,
+      "eval_steps_per_second": 2.776,
+      "step": 70
+    },
+    {
+      "epoch": 1.2730923694779117,
+      "grad_norm": 2.6527364253997803,
+      "learning_rate": 0.00011397849462365593,
+      "loss": 1.9188,
+      "step": 80
+    },
+    {
+      "epoch": 1.2730923694779117,
+      "eval_loss": 0.284964382648468,
+      "eval_runtime": 44.0349,
+      "eval_samples_per_second": 11.332,
+      "eval_steps_per_second": 2.839,
+      "step": 80
+    },
+    {
+      "epoch": 1.4337349397590362,
+      "grad_norm": 3.009997606277466,
+      "learning_rate": 0.0001032258064516129,
+      "loss": 1.9806,
+      "step": 90
+    },
+    {
+      "epoch": 1.4337349397590362,
+      "eval_loss": 0.2806684672832489,
+      "eval_runtime": 46.0655,
+      "eval_samples_per_second": 10.832,
+      "eval_steps_per_second": 2.714,
+      "step": 90
+    },
+    {
+      "epoch": 1.5943775100401605,
+      "grad_norm": 2.434161424636841,
+      "learning_rate": 9.247311827956989e-05,
+      "loss": 1.8809,
+      "step": 100
+    },
+    {
+      "epoch": 1.5943775100401605,
+      "eval_loss": 0.27644357085227966,
+      "eval_runtime": 44.1705,
+      "eval_samples_per_second": 11.297,
+      "eval_steps_per_second": 2.83,
+      "step": 100
+    },
+    {
+      "epoch": 1.7550200803212852,
+      "grad_norm": 3.0396993160247803,
+      "learning_rate": 8.172043010752689e-05,
+      "loss": 1.8046,
+      "step": 110
+    },
+    {
+      "epoch": 1.7550200803212852,
+      "eval_loss": 0.27500617504119873,
+      "eval_runtime": 44.3244,
+      "eval_samples_per_second": 11.258,
+      "eval_steps_per_second": 2.82,
+      "step": 110
+    },
+    {
+      "epoch": 1.9156626506024095,
+      "grad_norm": 2.961737632751465,
+      "learning_rate": 7.096774193548388e-05,
+      "loss": 1.929,
+      "step": 120
+    },
+    {
+      "epoch": 1.9156626506024095,
+      "eval_loss": 0.2731240689754486,
+      "eval_runtime": 43.8996,
+      "eval_samples_per_second": 11.367,
+      "eval_steps_per_second": 2.847,
+      "step": 120
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 186,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.475620506599424e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-126/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:397d33c02930459e5f249b114462600d30d6bd219204dba10b2570a68569ae34
+size 5240

checkpoint-186/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: google/gemma-3-4b-it
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed to make further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.15.1

checkpoint-186/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-3-4b-it",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "up_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-186/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bfeb68b0a4f2f293efe175c3ce95680d0d7d10a4af7ed523f239b1c4997c4236
+size 131251312

checkpoint-186/config.json ADDED Viewed

	@@ -0,0 +1,60 @@

+{
+  "architectures": [
+    "Gemma3ForConditionalGeneration"
+  ],
+  "boi_token_index": 255999,
+  "eoi_token_index": 256000,
+  "eos_token_id": [
+    1,
+    106
+  ],
+  "image_token_index": 262144,
+  "initializer_range": 0.02,
+  "mm_tokens_per_image": 256,
+  "model_type": "gemma3",
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_logit_softcapping": null,
+    "cache_implementation": "hybrid",
+    "final_logit_softcapping": null,
+    "head_dim": 256,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 10240,
+    "max_position_embeddings": 131072,
+    "model_type": "gemma3_text",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 34,
+    "num_key_value_heads": 4,
+    "query_pre_attn_scalar": 256,
+    "rms_norm_eps": 1e-06,
+    "rope_local_base_freq": 10000.0,
+    "rope_scaling": {
+      "factor": 8.0,
+      "rope_type": "linear"
+    },
+    "rope_theta": 1000000.0,
+    "sliding_window": 1024,
+    "sliding_window_pattern": 6,
+    "use_cache": true,
+    "vocab_size": 262208
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.50.2",
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 896,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "patch_size": 14,
+    "vision_use_head": false
+  }
+}

checkpoint-186/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:830a2f5b70ff5c0a907d7a0094d1c1fd7b46866030b972ee9e75820e5e7b96f2
+size 238821222

checkpoint-186/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:360a6be4633dc0315ec4d25053047c9cd4fe56aad5b002436233716de0350afc
+size 14244

checkpoint-186/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71ec17dd81af387bd1146a1b8edbaf6a378f8acfc261443bbcbf59b5d627ba85
+size 1064

checkpoint-186/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": "<eos>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-186/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

checkpoint-186/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-186/trainer_state.json ADDED Viewed

	@@ -0,0 +1,304 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.963855421686747,
+  "eval_steps": 10,
+  "global_step": 186,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.1606425702811245,
+      "grad_norm": 8.837871551513672,
+      "learning_rate": 0.000189247311827957,
+      "loss": 8.4808,
+      "step": 10
+    },
+    {
+      "epoch": 0.1606425702811245,
+      "eval_loss": 0.47449949383735657,
+      "eval_runtime": 44.2521,
+      "eval_samples_per_second": 11.276,
+      "eval_steps_per_second": 2.825,
+      "step": 10
+    },
+    {
+      "epoch": 0.321285140562249,
+      "grad_norm": 3.9745054244995117,
+      "learning_rate": 0.00017849462365591398,
+      "loss": 3.0872,
+      "step": 20
+    },
+    {
+      "epoch": 0.321285140562249,
+      "eval_loss": 0.3382565677165985,
+      "eval_runtime": 48.3781,
+      "eval_samples_per_second": 10.315,
+      "eval_steps_per_second": 2.584,
+      "step": 20
+    },
+    {
+      "epoch": 0.4819277108433735,
+      "grad_norm": 3.2844247817993164,
+      "learning_rate": 0.00016774193548387098,
+      "loss": 2.7654,
+      "step": 30
+    },
+    {
+      "epoch": 0.4819277108433735,
+      "eval_loss": 0.3172107934951782,
+      "eval_runtime": 50.1957,
+      "eval_samples_per_second": 9.941,
+      "eval_steps_per_second": 2.49,
+      "step": 30
+    },
+    {
+      "epoch": 0.642570281124498,
+      "grad_norm": 2.986614465713501,
+      "learning_rate": 0.00015698924731182796,
+      "loss": 2.4167,
+      "step": 40
+    },
+    {
+      "epoch": 0.642570281124498,
+      "eval_loss": 0.3065057694911957,
+      "eval_runtime": 49.9091,
+      "eval_samples_per_second": 9.998,
+      "eval_steps_per_second": 2.505,
+      "step": 40
+    },
+    {
+      "epoch": 0.8032128514056225,
+      "grad_norm": 2.47356915473938,
+      "learning_rate": 0.00014623655913978496,
+      "loss": 2.5307,
+      "step": 50
+    },
+    {
+      "epoch": 0.8032128514056225,
+      "eval_loss": 0.296438992023468,
+      "eval_runtime": 45.7506,
+      "eval_samples_per_second": 10.907,
+      "eval_steps_per_second": 2.732,
+      "step": 50
+    },
+    {
+      "epoch": 0.963855421686747,
+      "grad_norm": 2.9974865913391113,
+      "learning_rate": 0.00013548387096774193,
+      "loss": 2.48,
+      "step": 60
+    },
+    {
+      "epoch": 0.963855421686747,
+      "eval_loss": 0.2858414649963379,
+      "eval_runtime": 47.1672,
+      "eval_samples_per_second": 10.579,
+      "eval_steps_per_second": 2.65,
+      "step": 60
+    },
+    {
+      "epoch": 1.1124497991967872,
+      "grad_norm": 2.8883159160614014,
+      "learning_rate": 0.00012473118279569893,
+      "loss": 1.9267,
+      "step": 70
+    },
+    {
+      "epoch": 1.1124497991967872,
+      "eval_loss": 0.28903448581695557,
+      "eval_runtime": 45.0342,
+      "eval_samples_per_second": 11.08,
+      "eval_steps_per_second": 2.776,
+      "step": 70
+    },
+    {
+      "epoch": 1.2730923694779117,
+      "grad_norm": 2.6527364253997803,
+      "learning_rate": 0.00011397849462365593,
+      "loss": 1.9188,
+      "step": 80
+    },
+    {
+      "epoch": 1.2730923694779117,
+      "eval_loss": 0.284964382648468,
+      "eval_runtime": 44.0349,
+      "eval_samples_per_second": 11.332,
+      "eval_steps_per_second": 2.839,
+      "step": 80
+    },
+    {
+      "epoch": 1.4337349397590362,
+      "grad_norm": 3.009997606277466,
+      "learning_rate": 0.0001032258064516129,
+      "loss": 1.9806,
+      "step": 90
+    },
+    {
+      "epoch": 1.4337349397590362,
+      "eval_loss": 0.2806684672832489,
+      "eval_runtime": 46.0655,
+      "eval_samples_per_second": 10.832,
+      "eval_steps_per_second": 2.714,
+      "step": 90
+    },
+    {
+      "epoch": 1.5943775100401605,
+      "grad_norm": 2.434161424636841,
+      "learning_rate": 9.247311827956989e-05,
+      "loss": 1.8809,
+      "step": 100
+    },
+    {
+      "epoch": 1.5943775100401605,
+      "eval_loss": 0.27644357085227966,
+      "eval_runtime": 44.1705,
+      "eval_samples_per_second": 11.297,
+      "eval_steps_per_second": 2.83,
+      "step": 100
+    },
+    {
+      "epoch": 1.7550200803212852,
+      "grad_norm": 3.0396993160247803,
+      "learning_rate": 8.172043010752689e-05,
+      "loss": 1.8046,
+      "step": 110
+    },
+    {
+      "epoch": 1.7550200803212852,
+      "eval_loss": 0.27500617504119873,
+      "eval_runtime": 44.3244,
+      "eval_samples_per_second": 11.258,
+      "eval_steps_per_second": 2.82,
+      "step": 110
+    },
+    {
+      "epoch": 1.9156626506024095,
+      "grad_norm": 2.961737632751465,
+      "learning_rate": 7.096774193548388e-05,
+      "loss": 1.929,
+      "step": 120
+    },
+    {
+      "epoch": 1.9156626506024095,
+      "eval_loss": 0.2731240689754486,
+      "eval_runtime": 43.8996,
+      "eval_samples_per_second": 11.367,
+      "eval_steps_per_second": 2.847,
+      "step": 120
+    },
+    {
+      "epoch": 2.0642570281124497,
+      "grad_norm": 2.265627145767212,
+      "learning_rate": 6.021505376344086e-05,
+      "loss": 1.5674,
+      "step": 130
+    },
+    {
+      "epoch": 2.0642570281124497,
+      "eval_loss": 0.2714921832084656,
+      "eval_runtime": 45.3105,
+      "eval_samples_per_second": 11.013,
+      "eval_steps_per_second": 2.759,
+      "step": 130
+    },
+    {
+      "epoch": 2.2248995983935744,
+      "grad_norm": 2.743725061416626,
+      "learning_rate": 4.9462365591397855e-05,
+      "loss": 1.5314,
+      "step": 140
+    },
+    {
+      "epoch": 2.2248995983935744,
+      "eval_loss": 0.28298166394233704,
+      "eval_runtime": 47.8967,
+      "eval_samples_per_second": 10.418,
+      "eval_steps_per_second": 2.61,
+      "step": 140
+    },
+    {
+      "epoch": 2.3855421686746987,
+      "grad_norm": 3.021275043487549,
+      "learning_rate": 3.870967741935484e-05,
+      "loss": 1.5405,
+      "step": 150
+    },
+    {
+      "epoch": 2.3855421686746987,
+      "eval_loss": 0.2852790653705597,
+      "eval_runtime": 47.6731,
+      "eval_samples_per_second": 10.467,
+      "eval_steps_per_second": 2.622,
+      "step": 150
+    },
+    {
+      "epoch": 2.5461847389558234,
+      "grad_norm": 2.859989881515503,
+      "learning_rate": 2.7956989247311828e-05,
+      "loss": 1.4902,
+      "step": 160
+    },
+    {
+      "epoch": 2.5461847389558234,
+      "eval_loss": 0.28039097785949707,
+      "eval_runtime": 49.8843,
+      "eval_samples_per_second": 10.003,
+      "eval_steps_per_second": 2.506,
+      "step": 160
+    },
+    {
+      "epoch": 2.7068273092369477,
+      "grad_norm": 2.8204994201660156,
+      "learning_rate": 1.7204301075268818e-05,
+      "loss": 1.4331,
+      "step": 170
+    },
+    {
+      "epoch": 2.7068273092369477,
+      "eval_loss": 0.27951765060424805,
+      "eval_runtime": 49.9628,
+      "eval_samples_per_second": 9.987,
+      "eval_steps_per_second": 2.502,
+      "step": 170
+    },
+    {
+      "epoch": 2.8674698795180724,
+      "grad_norm": 2.8627824783325195,
+      "learning_rate": 6.451612903225806e-06,
+      "loss": 1.4442,
+      "step": 180
+    },
+    {
+      "epoch": 2.8674698795180724,
+      "eval_loss": 0.280029833316803,
+      "eval_runtime": 49.9016,
+      "eval_samples_per_second": 10.0,
+      "eval_steps_per_second": 2.505,
+      "step": 180
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 186,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.632546051948544e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-186/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:397d33c02930459e5f249b114462600d30d6bd219204dba10b2570a68569ae34
+size 5240

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:04c6490d86117b3add406e0d7d46aa909e80445addeaabac60848f2cfd65b774
 size 5240

 version https://git-lfs.github.com/spec/v1
+oid sha256:397d33c02930459e5f249b114462600d30d6bd219204dba10b2570a68569ae34
 size 5240