FlameF0X committed on
Commit
7e5e590
·
verified ·
1 Parent(s): d289d56

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SnowflakeCoreG1"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_snowflake_core.SnowflakeCoreConfig",
7
+ "AutoModelForCausalLM": "modeling_snowflake_core.SnowflakeCoreG1"
8
+ },
9
+ "bos_token_id": 50256,
10
+ "dropout": 0.1,
11
+ "embed_dim": 1024,
12
+ "eos_token_id": 50256,
13
+ "ffn_dim": 4096,
14
+ "max_length": 2048,
15
+ "model_type": "snowflake_core",
16
+ "num_heads": 16,
17
+ "num_layers": 24,
18
+ "pad_token_id": 50256,
19
+ "torch_dtype": "float32",
20
+ "training_config": {
21
+ "actual_epochs": 0,
22
+ "batch_size": 1,
23
+ "early_stopping": {
24
+ "min_delta": 0.001,
25
+ "patience": 3,
26
+ "triggered": false
27
+ },
28
+ "epochs": 2,
29
+ "grad_accum_steps": 32,
30
+ "learning_rate": 0.0002,
31
+ "max_length": 2048,
32
+ "val_split_ratio": 0.1
33
+ },
34
+ "training_metrics": {
35
+ "best_val_loss": Infinity,
36
+ "best_val_perplexity": null,
37
+ "final_train_loss": null,
38
+ "final_train_perplexity": null,
39
+ "final_val_loss": null,
40
+ "final_val_perplexity": null
41
+ },
42
+ "transformers_version": "4.53.1",
43
+ "unk_token_id": 50256,
44
+ "vocab_size": 50257
45
+ }
configuration_snowflake_core.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
class SnowflakeCoreConfig(PretrainedConfig):
    """Configuration container for the ``snowflake_core`` model type.

    The special-token ids (and any extra keyword arguments) are handed to
    ``PretrainedConfig``; the architecture hyper-parameters are stored as
    plain attributes on the instance.

    Args:
        vocab_size: Number of entries in the token vocabulary.
        embed_dim: Width of the token/position embeddings.
        num_heads: Number of attention heads.
        num_layers: Number of transformer blocks.
        max_length: Maximum sequence length stored on the config.
        ffn_dim: Feed-forward width stored on the config.
        pad_token_id: Id of the padding token.
        eos_token_id: Id of the end-of-sequence token.
        bos_token_id: Id of the beginning-of-sequence token, if any.
        unk_token_id: Id of the unknown token, if any.
        dropout: Dropout probability.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "snowflake_core"

    def __init__(
        self,
        vocab_size=50257,
        embed_dim=1024,
        num_heads=16,
        num_layers=24,
        max_length=2048,
        ffn_dim=4096,
        pad_token_id=50256,
        eos_token_id=50256,
        bos_token_id=None,
        unk_token_id=None,
        dropout=0.1,
        **kwargs
    ):
        special_token_ids = {
            "pad_token_id": pad_token_id,
            "eos_token_id": eos_token_id,
            "bos_token_id": bos_token_id,
            "unk_token_id": unk_token_id,
        }
        super().__init__(**special_token_ids, **kwargs)

        # Hyper-parameters are assigned only after the base initialiser has
        # run, so these values are the ones that end up on the instance.
        hyperparameters = (
            ("vocab_size", vocab_size),
            ("embed_dim", embed_dim),
            ("num_heads", num_heads),
            ("num_layers", num_layers),
            ("max_length", max_length),
            ("ffn_dim", ffn_dim),
            ("dropout", dropout),
        )
        for attr_name, attr_value in hyperparameters:
            setattr(self, attr_name, attr_value)
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_snowflake_core.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from transformers import PreTrainedModel, PretrainedConfig
5
+ from typing import Optional, Tuple
6
+
7
+ # Optional: import custom config if present
8
+ try:
9
+ from .configuration_snowflake_core import SnowflakeCoreConfig
10
+ except ImportError:
11
+ SnowflakeCoreConfig = PretrainedConfig
12
+
13
class FusedSelfAttention(nn.Module):
    """Multi-head self-attention using one fused Q/K/V projection.

    Args:
        embed_dim: Width of the input and output features.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError("embed_dim must be divisible by num_heads")
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Input of shape ``[B, T, C]``.
            attn_mask: Optional ``[T, T]`` mask shared by all batches and
                heads. A boolean mask marks blocked positions with ``True``;
                any other dtype is treated as an additive bias (for example
                ``0`` / ``-inf``).
            key_padding_mask: Optional boolean ``[B, T]`` mask where ``True``
                marks keys that must not be attended to.

        Returns:
            Tensor of shape ``[B, T, C]``.
        """
        B, T, C = x.size()
        qkv = self.qkv_proj(x)  # [B, T, 3 * C]
        qkv = qkv.reshape(B, T, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # Each: [B, num_heads, T, head_dim]

        attn_scores = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)  # [B, num_heads, T, T]
        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(0).unsqueeze(0)
            if attn_mask.dtype == torch.bool:
                # Casting a boolean mask to float and adding it would merely
                # add 1.0 to the blocked positions; exclude them instead.
                attn_scores = attn_scores.masked_fill(attn_mask, float("-inf"))
            else:
                attn_scores = attn_scores + attn_mask.to(attn_scores.dtype)
        if key_padding_mask is not None:
            attn_scores = attn_scores.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf")
            )
        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_output = attn_probs @ v  # [B, num_heads, T, head_dim]
        attn_output = attn_output.transpose(1, 2).reshape(B, T, C)
        return self.out_proj(attn_output)
39
+
40
class GPTBlock(nn.Module):
    """Pre-LayerNorm transformer block: self-attention followed by an MLP.

    Each sub-layer is applied to a layer-normalised input and its
    dropout-regularised output is added back to the residual stream.

    Args:
        embed_dim: Width of the residual stream.
        num_heads: Number of attention heads.
        dropout: Dropout probability used after attention, inside the MLP and
            after the MLP.
        ffn_dim: Hidden width of the MLP. ``None`` (the default) keeps the
            previous fixed width of ``4 * embed_dim``.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.1, ffn_dim=None):
        super().__init__()
        if ffn_dim is None:
            ffn_dim = 4 * embed_dim
        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = FusedSelfAttention(embed_dim, num_heads)
        self.dropout1 = nn.Dropout(dropout)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, ffn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ffn_dim, embed_dim),
        )
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """Run the block on ``x``; both masks are passed through to ``self.attn``."""
        h = self.ln1(x)
        attn_output = self.attn(h, attn_mask=attn_mask, key_padding_mask=key_padding_mask)
        x = x + self.dropout1(attn_output)
        x = x + self.dropout2(self.mlp(self.ln2(x)))
        return x
60
+
61
class SnowflakeCoreG1(PreTrainedModel):
    """Decoder-only language model built from a stack of ``GPTBlock`` layers.

    Token embeddings and learned position embeddings are summed, passed
    through ``num_layers`` blocks and a final LayerNorm, then projected to
    vocabulary logits by ``lm_head``.
    """

    config_class = SnowflakeCoreConfig
    supports_gradient_checkpointing = True

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        self.embed_dim = config.embed_dim
        self.num_heads = config.num_heads
        self.num_layers = config.num_layers
        self.max_length = config.max_length
        # NOTE: ffn_dim is recorded here but is not forwarded to the blocks.
        self.ffn_dim = getattr(config, 'ffn_dim', 4 * config.embed_dim)
        self.dropout = getattr(config, 'dropout', 0.1)

        self.embed = nn.Embedding(self.vocab_size, self.embed_dim)
        self.pos_embed = nn.Embedding(self.max_length, self.embed_dim)
        self.dropout_layer = nn.Dropout(self.dropout)
        self.blocks = nn.ModuleList([
            GPTBlock(self.embed_dim, self.num_heads, self.dropout) for _ in range(self.num_layers)
        ])
        self.ln_f = nn.LayerNorm(self.embed_dim)
        self.lm_head = nn.Linear(self.embed_dim, self.vocab_size, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.embed = value

    def get_output_embeddings(self):
        """Return the output projection (``lm_head``)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the output projection (``lm_head``)."""
        self.lm_head = new_embeddings

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> dict:
        """Compute next-token logits and, when ``labels`` is given, the LM loss.

        Args:
            input_ids: Token ids of shape ``[B, T]``.
            attention_mask: Optional ``[B, T]`` mask; positions equal to ``0``
                are excluded as attention keys.
            labels: Optional ``[B, T]`` target ids. They are shifted
                internally, so position ``t`` predicts ``labels[:, t + 1]``.
                Targets equal to ``config.pad_token_id`` or ``-100`` do not
                contribute to the loss.
            **kwargs: Accepted and ignored.

        Returns:
            A dict with ``"logits"`` of shape ``[B, T, vocab_size]`` and,
            when ``labels`` is given, ``"loss"``.

        Raises:
            ValueError: If ``input_ids`` is missing or longer than the
                position-embedding table.
        """
        if input_ids is None:
            raise ValueError("input_ids must be provided")
        _, seq_len = input_ids.size()
        if seq_len > self.max_length:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {self.max_length}"
            )
        device = input_ids.device

        pos = torch.arange(0, seq_len, device=device).unsqueeze(0)
        x = self.embed(input_ids) + self.pos_embed(pos)
        x = self.dropout_layer(x)

        # Additive causal mask: 0 where attention is allowed, -inf on future
        # positions. The fill must target a floating tensor; filling a bool
        # tensor keeps it bool and never produces -inf.
        future = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=device), diagonal=1
        )
        causal_mask = torch.zeros(
            seq_len, seq_len, dtype=x.dtype, device=device
        ).masked_fill(future, float('-inf'))

        key_padding_mask = None
        if attention_mask is not None:
            key_padding_mask = attention_mask == 0
        for block in self.blocks:
            x = block(x, attn_mask=causal_mask, key_padding_mask=key_padding_mask)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if labels is None:
            return {"logits": logits}

        shift_logits = logits[:, :-1, :].contiguous().view(-1, self.vocab_size)
        shift_labels = labels[:, 1:].contiguous().view(-1)
        # Map padding targets onto -100 so one ignore_index covers both the
        # pad token and the conventional -100 "ignore" label.
        pad_token_id = self.config.pad_token_id
        if pad_token_id is not None:
            shift_labels = shift_labels.masked_fill(shift_labels == pad_token_id, -100)
        loss = F.cross_entropy(shift_logits, shift_labels, ignore_index=-100)
        return {"loss": loss, "logits": logits}

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, config=None, **kwargs):
        """Delegate unchanged to ``PreTrainedModel.from_pretrained``."""
        return super().from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd68923838d2083d549d10765f4b663e2de784b376c9499a95282666b746cd23
3
+ size 1423580250
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "max_length": 2048,
18
+ "model_max_length": 512,
19
+ "pad_to_multiple_of": null,
20
+ "pad_token": "<|endoftext|>",
21
+ "pad_token_type_id": 0,
22
+ "padding_side": "right",
23
+ "stride": 0,
24
+ "tokenizer_class": "GPT2Tokenizer",
25
+ "truncation_side": "right",
26
+ "truncation_strategy": "longest_first",
27
+ "unk_token": "<|endoftext|>",
28
+ "use_safetensors": true
29
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff