Felladrin/TinyMistral-248M-Chat-v4

@@ -73,7 +73,7 @@ widget:
   - [[ChatML](https://huggingface.co/datasets/Felladrin/ChatML-reddit-instruct-curated)] [euclaise/reddit-instruct-curated](https://huggingface.co/datasets/euclaise/reddit-instruct-curated)
   - [[ChatML](https://huggingface.co/datasets/Felladrin/ChatML-aya_dataset)] [CohereForAI/aya_dataset](https://huggingface.co/datasets/CohereForAI/aya_dataset)
   - [HuggingFaceH4/ultrafeedback_binarized](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
-- License: [Apache License 2.0](https://huggingface.co/Felladrin/TinyMistral-248M-Chat-v3/resolve/main/license.txt)
 ## Recommended Prompt Format
@@ -91,7 +91,7 @@ widget:
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 import torch
-model_path = "Felladrin/TinyMistral-248M-Chat-v3"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
@@ -178,7 +178,6 @@ llamafactory-cli train \
     --preprocessing_num_workers $(python -c "import os; print(max(1, os.cpu_count() - 2))") \
     --dataloader_num_workers $(python -c "import os; print(max(1, os.cpu_count() - 2))") \
     --finetuning_type full \
-    --template default \
     --flash_attn auto \
     --enable_liger_kernel True \
     --dataset_dir data \
@@ -188,15 +187,15 @@ llamafactory-cli train \
     --num_train_epochs 2.0 \
     --per_device_train_batch_size 4 \
     --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
     --max_grad_norm 1.0 \
     --logging_steps 10 \
     --save_steps 50 \
     --save_total_limit 1 \
     --warmup_ratio 0.1 \
     --packing False \
-    --report_to none \
-    --output_dir ~/TinyMistral-248M-Chat-v3 \
     --pure_bf16 True \
     --plot_loss True \
     --trust_remote_code True \

   - [[ChatML](https://huggingface.co/datasets/Felladrin/ChatML-reddit-instruct-curated)] [euclaise/reddit-instruct-curated](https://huggingface.co/datasets/euclaise/reddit-instruct-curated)
   - [[ChatML](https://huggingface.co/datasets/Felladrin/ChatML-aya_dataset)] [CohereForAI/aya_dataset](https://huggingface.co/datasets/CohereForAI/aya_dataset)
   - [HuggingFaceH4/ultrafeedback_binarized](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
+- License: [Apache License 2.0](https://huggingface.co/Felladrin/TinyMistral-248M-Chat-v4/resolve/main/license.txt)
 ## Recommended Prompt Format
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 import torch
+model_path = "Felladrin/TinyMistral-248M-Chat-v4"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
     --preprocessing_num_workers $(python -c "import os; print(max(1, os.cpu_count() - 2))") \
     --dataloader_num_workers $(python -c "import os; print(max(1, os.cpu_count() - 2))") \
     --finetuning_type full \
     --flash_attn auto \
     --enable_liger_kernel True \
     --dataset_dir data \
     --num_train_epochs 2.0 \
     --per_device_train_batch_size 4 \
     --gradient_accumulation_steps 4 \
+    --lr_scheduler_type linear \
     --max_grad_norm 1.0 \
     --logging_steps 10 \
     --save_steps 50 \
     --save_total_limit 1 \
     --warmup_ratio 0.1 \
     --packing False \
+    --report_to tensorboard \
+    --output_dir ~/TinyMistral-248M-Chat-v4 \
     --pure_bf16 True \
     --plot_loss True \
     --trust_remote_code True \

config.json CHANGED

@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "Felladrin/TinyMistral-248M-Chat-v3",
   "architectures": ["MistralForCausalLM"],
   "attention_dropout": 0.0,
   "bos_token_id": 32000,
@@ -19,8 +18,8 @@
   "sliding_window": null,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.49.0",
-  "use_cache": true,
   "use_sliding_window": false,
   "vocab_size": 32005
 }

 {
   "architectures": ["MistralForCausalLM"],
   "attention_dropout": 0.0,
   "bos_token_id": 32000,
   "sliding_window": null,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
+  "transformers_version": "4.50.0",
+  "use_cache": false,
   "use_sliding_window": false,
   "vocab_size": 32005
 }

model.safetensors CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ddf0ded71ab5a315f90bc932d018a2b20e81987f49f8c9c6efcaf612b2d5a4d6
 size 496060688

 version https://git-lfs.github.com/spec/v1
+oid sha256:da09172da13d6da1727beb0cef6c42e3fbc99bd3d9bdfedc0df8f5b2746c02a0
 size 496060688

tokenizer.json CHANGED

@@ -1,11 +1,6 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 1536,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
   "padding": null,
   "added_tokens": [
     {

 {
   "version": "1.0",
+  "truncation": null,
   "padding": null,
   "added_tokens": [
     {