Arihant Tripathi committed on Apr 11

Commit

fab756e

verified ·

1 Parent(s): 75f35a9

qwen_new_mage_per_domain_balanced_moe_lr

Browse files

Files changed (23) hide show

.gitattributes +1 -0
README.md +68 -0
added_tokens.json +5 -0
config.json +40 -0
merges.txt +0 -0
model-00001-of-00012.safetensors +3 -0
model-00002-of-00012.safetensors +3 -0
model-00003-of-00012.safetensors +3 -0
model-00004-of-00012.safetensors +3 -0
model-00005-of-00012.safetensors +3 -0
model-00006-of-00012.safetensors +3 -0
model-00007-of-00012.safetensors +3 -0
model-00008-of-00012.safetensors +3 -0
model-00009-of-00012.safetensors +3 -0
model-00010-of-00012.safetensors +3 -0
model-00011-of-00012.safetensors +3 -0
model-00012-of-00012.safetensors +3 -0
model.safetensors.index.json +0 -0
special_tokens_map.json +20 -0
tokenizer.json +3 -0
tokenizer_config.json +44 -0
training_args.bin +3 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,68 @@

+---
+library_name: transformers
+license: other
+base_model: Qwen/Qwen1.5-MoE-A2.7B
+tags:
+- generated_from_trainer
+metrics:
+- accuracy
+model-index:
+- name: fine_tuned_per_domain_balanced_moe_lr
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# fine_tuned_per_domain_balanced_moe_lr
+This model is a fine-tuned version of [Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B) on an unknown dataset.
+It achieves the following results on the evaluation set at the final logged evaluation (step 4000):
+- Loss: 0.6637
+- Accuracy: 0.8800
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5e-06
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
+- lr_scheduler_type: linear
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss | Accuracy |
+|:-------------:|:------:|:----:|:---------------:|:--------:|
+| 2.2077        | 0.0029 | 500  | 1.9021          | 0.8317   |
+| 0.9585        | 0.0057 | 1000 | 2.2812          | 0.8299   |
+| 1.479         | 0.0086 | 1500 | 1.5268          | 0.8066   |
+| 1.1161        | 0.0114 | 2000 | 0.9974          | 0.8550   |
+| 0.8147        | 0.0143 | 2500 | 0.6406          | 0.8926   |
+| 1.4377        | 0.0172 | 3000 | 1.5956          | 0.8156   |
+| 0.6541        | 0.0200 | 3500 | 0.9456          | 0.8720   |
+| 0.9445        | 0.0229 | 4000 | 0.6637          | 0.8800   |
+### Framework versions
+- Transformers 4.49.0
+- Pytorch 2.6.0+cu126
+- Datasets 3.3.2
+- Tokenizers 0.21.0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B",
+  "architectures": [
+    "Qwen2MoeForSequenceClassification"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "decoder_sparse_step": 1,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 5632,
+  "max_position_embeddings": 8192,
+  "max_window_layers": 21,
+  "mlp_only_layers": [],
+  "model_type": "qwen2_moe",
+  "moe_intermediate_size": 1408,
+  "norm_topk_prob": false,
+  "num_attention_heads": 16,
+  "num_experts": 60,
+  "num_experts_per_tok": 4,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 16,
+  "output_router_logits": false,
+  "pad_token_id": 151643,
+  "problem_type": "single_label_classification",
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "router_aux_loss_coef": 0.001,
+  "shared_expert_intermediate_size": 5632,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:afc854b7e5e7395d0d3552f81fb472504fce5d4c3009090b992359e0d809028d
+size 4990221104

model-00002-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:175fbbe6061a39f1160ad0af3e35222d1cf682c0b7267621b633c13a10312fc4
+size 4991306528

model-00003-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7926774d2cc1134ccd62a2f6d250e68fc6d2b7a02e5d5479b27edba4d939a24
+size 4990298240

model-00004-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf75f30bd3c8b53ddc348dca9f094104055e4e7d28497656b9f29ba303303eb1
+size 4990757696

model-00005-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac297d45c0385ee3f952bdbce83f04cc42e6f61170787c976092d6df29393b78
+size 4991306600

model-00006-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:55e0936aa45724b44f11a58775f07be91db6444efc904614342aa4c5c5dc81ca
+size 4991306936

model-00007-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f7f9d137059f33ecb396ae49da94041e9fe228ffc902cc4a4e3d1dc6c6c0368
+size 4991306952

model-00008-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0867b92dd3d32d302733f6664f56e0c41478f367c602de17eda9f5a91edb5875
+size 4968238032

model-00009-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c8dc0582d2e8b63acfc9648a7f6fe9adbebc0f099b6c59d05d4afb854fb488a
+size 4989749864

model-00010-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4829cd1b3f963df55ea127116b2b91b5225d032566f54edf30eb75c9e59497a
+size 4991306928

model-00011-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f793f00619ed64bf417add3afd4a562fe7765689af078f4b6d9acc8dd6ce3a3c
+size 4991306936

model-00012-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa7e68f9c8854bd5c5c5326846718aaf446c286811be1bd9c65322558a1eaa9f
+size 1141959976

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f5ae291beb14c5986404b36fdaabbcf3cb858121c6239ffbec6c1405cbafd9dc
+size 11418544

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2128e785ea52477c854f8f01ce62225fc643900a38c392526a2e097b7deb08e2
+size 5304

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff