Upload folder using huggingface_hub
Browse files- config.json +45 -0
- configuration_snowflake_core.py +34 -0
- merges.txt +0 -0
- modeling_snowflake_core.py +130 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +30 -0
- tokenizer.json +0 -0
- tokenizer_config.json +29 -0
- vocab.json +0 -0
config.json
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"SnowflakeCoreG1"
|
4 |
+
],
|
5 |
+
"auto_map": {
|
6 |
+
"AutoConfig": "configuration_snowflake_core.SnowflakeCoreConfig",
|
7 |
+
"AutoModelForCausalLM": "modeling_snowflake_core.SnowflakeCoreG1"
|
8 |
+
},
|
9 |
+
"bos_token_id": 50256,
|
10 |
+
"dropout": 0.1,
|
11 |
+
"embed_dim": 1024,
|
12 |
+
"eos_token_id": 50256,
|
13 |
+
"ffn_dim": 4096,
|
14 |
+
"max_length": 2048,
|
15 |
+
"model_type": "snowflake_core",
|
16 |
+
"num_heads": 16,
|
17 |
+
"num_layers": 24,
|
18 |
+
"pad_token_id": 50256,
|
19 |
+
"torch_dtype": "float32",
|
20 |
+
"training_config": {
|
21 |
+
"actual_epochs": 0,
|
22 |
+
"batch_size": 1,
|
23 |
+
"early_stopping": {
|
24 |
+
"min_delta": 0.001,
|
25 |
+
"patience": 3,
|
26 |
+
"triggered": false
|
27 |
+
},
|
28 |
+
"epochs": 2,
|
29 |
+
"grad_accum_steps": 32,
|
30 |
+
"learning_rate": 0.0002,
|
31 |
+
"max_length": 2048,
|
32 |
+
"val_split_ratio": 0.1
|
33 |
+
},
|
34 |
+
"training_metrics": {
|
35 |
+
"best_val_loss": null,
|
36 |
+
"best_val_perplexity": null,
|
37 |
+
"final_train_loss": null,
|
38 |
+
"final_train_perplexity": null,
|
39 |
+
"final_val_loss": null,
|
40 |
+
"final_val_perplexity": null
|
41 |
+
},
|
42 |
+
"transformers_version": "4.53.1",
|
43 |
+
"unk_token_id": 50256,
|
44 |
+
"vocab_size": 50257
|
45 |
+
}
|
configuration_snowflake_core.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PretrainedConfig
|
2 |
+
|
3 |
+
class SnowflakeCoreConfig(PretrainedConfig):
    """Hyperparameter container for the ``snowflake_core`` model type.

    The four special-token ids and any extra keyword arguments are handed to
    ``PretrainedConfig``; the remaining arguments are stored unchanged as
    attributes of the same name.

    Args:
        vocab_size: Stored as ``self.vocab_size``.
        embed_dim: Stored as ``self.embed_dim``.
        num_heads: Stored as ``self.num_heads``.
        num_layers: Stored as ``self.num_layers``.
        max_length: Stored as ``self.max_length``.
        ffn_dim: Stored as ``self.ffn_dim``.
        pad_token_id: Forwarded to ``PretrainedConfig``.
        eos_token_id: Forwarded to ``PretrainedConfig``.
        bos_token_id: Forwarded to ``PretrainedConfig``.
        unk_token_id: Forwarded to ``PretrainedConfig``.
        dropout: Stored as ``self.dropout``.
        **kwargs: Forwarded to ``PretrainedConfig``.
    """

    model_type = "snowflake_core"

    def __init__(
        self,
        vocab_size=50257,
        embed_dim=1024,
        num_heads=16,
        num_layers=24,
        max_length=2048,
        ffn_dim=4096,
        pad_token_id=50256,
        eos_token_id=50256,
        bos_token_id=None,
        unk_token_id=None,
        dropout=0.1,
        **kwargs
    ):
        # The base class owns the special-token ids, so it is initialised
        # first with exactly those four values plus the pass-through kwargs.
        special_token_ids = {
            "pad_token_id": pad_token_id,
            "eos_token_id": eos_token_id,
            "bos_token_id": bos_token_id,
            "unk_token_id": unk_token_id,
        }
        super().__init__(**special_token_ids, **kwargs)

        # Architecture hyperparameters, assigned in their original order.
        hyperparameters = (
            ("vocab_size", vocab_size),
            ("embed_dim", embed_dim),
            ("num_heads", num_heads),
            ("num_layers", num_layers),
            ("max_length", max_length),
            ("ffn_dim", ffn_dim),
            ("dropout", dropout),
        )
        for attribute, value in hyperparameters:
            setattr(self, attribute, value)
|
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
modeling_snowflake_core.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from transformers import PreTrainedModel, PretrainedConfig
|
5 |
+
from typing import Optional, Tuple
|
6 |
+
|
7 |
+
# Optional: import custom config if present
|
8 |
+
try:
|
9 |
+
from .configuration_snowflake_core import SnowflakeCoreConfig
|
10 |
+
except ImportError:
|
11 |
+
SnowflakeCoreConfig = PretrainedConfig
|
12 |
+
|
13 |
+
class FusedSelfAttention(nn.Module):
    """Multi-head self-attention with one fused projection for Q, K and V.

    Args:
        embed_dim: Size of the last input dimension; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == embed_dim
        ), "embed_dim must be divisible by num_heads"
        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Input of shape ``[B, T, C]``.
            attn_mask: Optional ``[T, T]`` mask shared by every batch element
                and head. A boolean mask marks blocked positions with
                ``True``; any other dtype is added to the attention scores.
            key_padding_mask: Optional ``[B, T]`` boolean mask; ``True``
                marks keys that no query may attend to.

        Returns:
            Tensor of shape ``[B, T, C]``.
        """
        B, T, C = x.size()
        qkv = self.qkv_proj(x)  # [B, T, 3 * C]
        qkv = qkv.reshape(B, T, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # Each: [B, num_heads, T, head_dim]

        attn_scores = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)  # [B, num_heads, T, T]
        if attn_mask is not None:
            broadcast_mask = attn_mask.unsqueeze(0).unsqueeze(0)  # [1, 1, T, T]
            if broadcast_mask.dtype == torch.bool:
                # Casting a boolean mask to float and adding it would only
                # shift blocked scores by +1.0; block them outright instead.
                attn_scores = attn_scores.masked_fill(broadcast_mask, float('-inf'))
            else:
                attn_scores = attn_scores + broadcast_mask.to(attn_scores.dtype)
        if key_padding_mask is not None:
            attn_scores = attn_scores.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2), float('-inf')
            )
        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_output = attn_probs @ v  # [B, num_heads, T, head_dim]
        attn_output = attn_output.transpose(1, 2).reshape(B, T, C)
        return self.out_proj(attn_output)
|
39 |
+
|
40 |
+
class GPTBlock(nn.Module):
    """Transformer block: LayerNorm -> self-attention -> residual, then
    LayerNorm -> MLP -> residual, with dropout on each branch output.

    Args:
        embed_dim: Feature size of the block input and output.
        num_heads: Number of heads given to the attention layer.
        dropout: Dropout probability used after attention, inside the MLP
            and after the MLP.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        hidden_dim = 4 * embed_dim  # MLP expansion width

        # Attention branch.
        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = FusedSelfAttention(embed_dim, num_heads)
        self.dropout1 = nn.Dropout(dropout)

        # Feed-forward branch.
        self.ln2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embed_dim),
        )
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        """Run the block on ``x``; both masks are passed to the attention layer."""
        attended = self.attn(
            self.ln1(x),
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
        )
        after_attention = x + self.dropout1(attended)
        feed_forward = self.mlp(self.ln2(after_attention))
        return after_attention + self.dropout2(feed_forward)
|
60 |
+
|
61 |
+
class SnowflakeCoreG1(PreTrainedModel):
    """Decoder-only language model: token and learned position embeddings,
    a stack of ``GPTBlock`` layers, a final LayerNorm and a bias-free
    projection to vocabulary logits.
    """

    config_class = SnowflakeCoreConfig
    # NOTE(review): forward() below never wraps the blocks in a checkpointing
    # call; confirm this flag is intended.
    supports_gradient_checkpointing = True

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        self.embed_dim = config.embed_dim
        self.num_heads = config.num_heads
        self.num_layers = config.num_layers
        self.max_length = config.max_length
        # Stored for reference only: GPTBlock is built without an ffn_dim
        # argument, so this value does not reach the blocks.
        self.ffn_dim = getattr(config, 'ffn_dim', 4 * config.embed_dim)
        self.dropout = getattr(config, 'dropout', 0.1)

        self.embed = nn.Embedding(self.vocab_size, self.embed_dim)
        self.pos_embed = nn.Embedding(self.max_length, self.embed_dim)
        self.dropout_layer = nn.Dropout(self.dropout)
        self.blocks = nn.ModuleList([
            GPTBlock(self.embed_dim, self.num_heads, self.dropout) for _ in range(self.num_layers)
        ])
        self.ln_f = nn.LayerNorm(self.embed_dim)
        self.lm_head = nn.Linear(self.embed_dim, self.vocab_size, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.embed = value

    def get_output_embeddings(self):
        """Return the vocabulary projection module."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the vocabulary projection module."""
        self.lm_head = new_embeddings

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> dict:
        """Compute next-token logits and, when ``labels`` is given, the loss.

        Args:
            input_ids: Token ids of shape ``[B, T]``. ``T`` must not exceed
                ``max_length``, the size of the position-embedding table.
            attention_mask: Optional ``[B, T]`` tensor; positions equal to 0
                are excluded as attention keys.
            labels: Optional ``[B, T]`` target ids. They are shifted by one
                position internally, so ``input_ids`` can be passed as-is.
            **kwargs: Accepted and ignored.

        Returns:
            ``{"logits": logits}`` with logits of shape ``[B, T, vocab_size]``,
            plus a ``"loss"`` entry when ``labels`` is provided.
        """
        _, T = input_ids.size()
        device = input_ids.device

        pos = torch.arange(0, T, device=device).unsqueeze(0)
        x = self.embed(input_ids) + self.pos_embed(pos)
        x = self.dropout_layer(x)

        # Additive causal mask: 0 on and below the diagonal, -inf strictly
        # above it. It has to be a floating-point tensor: filling a boolean
        # tensor with -inf leaves it boolean, and a boolean mask added to the
        # scores cannot hide future positions.
        future_positions = torch.triu(torch.ones(T, T, device=device), diagonal=1).bool()
        causal_mask = torch.zeros(T, T, device=device, dtype=x.dtype).masked_fill(
            future_positions, float('-inf')
        )

        key_padding_mask = None
        if attention_mask is not None:
            key_padding_mask = attention_mask == 0

        for block in self.blocks:
            x = block(x, attn_mask=causal_mask, key_padding_mask=key_padding_mask)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if labels is None:
            return {"logits": logits}

        # Position t predicts token t + 1: drop the last logit and the first label.
        shift_logits = logits[:, :-1, :].contiguous().view(-1, self.vocab_size)
        shift_labels = labels[:, 1:].contiguous().view(-1)
        # NOTE(review): targets equal to pad_token_id are ignored. The shipped
        # config uses the same id for pad and eos, so end-of-text targets are
        # skipped as well - confirm this is intended.
        loss = F.cross_entropy(shift_logits, shift_labels, ignore_index=self.config.pad_token_id)
        return {"loss": loss, "logits": logits}

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, config=None, **kwargs):
        """Delegate to ``PreTrainedModel.from_pretrained`` without changes."""
        return super().from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd68923838d2083d549d10765f4b663e2de784b376c9499a95282666b746cd23
|
3 |
+
size 1423580250
|
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<|endoftext|>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": true,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "<|endoftext|>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": true,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "<|endoftext|>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": true,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"unk_token": {
|
24 |
+
"content": "<|endoftext|>",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": true,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
}
|
30 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"added_tokens_decoder": {
|
4 |
+
"50256": {
|
5 |
+
"content": "<|endoftext|>",
|
6 |
+
"lstrip": false,
|
7 |
+
"normalized": true,
|
8 |
+
"rstrip": false,
|
9 |
+
"single_word": false,
|
10 |
+
"special": true
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"bos_token": "<|endoftext|>",
|
14 |
+
"clean_up_tokenization_spaces": false,
|
15 |
+
"eos_token": "<|endoftext|>",
|
16 |
+
"extra_special_tokens": {},
|
17 |
+
"max_length": 2048,
|
18 |
+
"model_max_length": 512,
|
19 |
+
"pad_to_multiple_of": null,
|
20 |
+
"pad_token": "<|endoftext|>",
|
21 |
+
"pad_token_type_id": 0,
|
22 |
+
"padding_side": "right",
|
23 |
+
"stride": 0,
|
24 |
+
"tokenizer_class": "GPT2Tokenizer",
|
25 |
+
"truncation_side": "right",
|
26 |
+
"truncation_strategy": "longest_first",
|
27 |
+
"unk_token": "<|endoftext|>",
|
28 |
+
"use_safetensors": true
|
29 |
+
}
|
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|