Alexandru Gherghescu
committed on
Add original model weights + conversion script
Browse files
- README.md +6 -0
- gpt1-converted-weights/config.json +24 -0
- gpt1-converted-weights/configuration_gpt1.py +42 -0
- gpt1-converted-weights/generation_config.json +4 -0
- gpt1-converted-weights/model.safetensors +3 -0
- gpt1-converted-weights/modeling_gpt1.py +237 -0
- original_gpt1_params/.ipynb_checkpoints/encoder_bpe_40000-checkpoint.json +0 -0
- original_gpt1_params/.ipynb_checkpoints/params_shapes-checkpoint.json +1 -0
- original_gpt1_params/.ipynb_checkpoints/vocab_40000-checkpoint.bpe +0 -0
- original_gpt1_params/encoder_bpe_40000.json +0 -0
- original_gpt1_params/params_0.npy +3 -0
- original_gpt1_params/params_1.npy +3 -0
- original_gpt1_params/params_2.npy +3 -0
- original_gpt1_params/params_3.npy +3 -0
- original_gpt1_params/params_4.npy +3 -0
- original_gpt1_params/params_5.npy +3 -0
- original_gpt1_params/params_6.npy +3 -0
- original_gpt1_params/params_7.npy +3 -0
- original_gpt1_params/params_8.npy +3 -0
- original_gpt1_params/params_9.npy +3 -0
- original_gpt1_params/params_shapes.json +1 -0
- original_gpt1_params/vocab_40000.bpe +0 -0
- tf_weights_to_hf.py +85 -0

README.md
CHANGED
@@ -36,3 +36,9 @@ See `preprocessing.py` on how the data was preprocessed and tokenized.
 See `pre_training.py` on how the model was pre-trained.
 
 See `inference.py` for an example.
+
+## Converted model
+
+Inside `gpt1-converted-weights/` is the converted safetensors model from the
+original weights, which can be used directly with the code inside this repo. The
+conversion script and original weights can also be found there.
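
For quick usage, here is a minimal sketch of loading the converted weights with the bundled model code (run from inside `gpt1-converted-weights/` so that `modeling_gpt1.py` and its import of `configuration_gpt1.py` resolve; the repo's `inference.py` remains the reference example):

import torch

from modeling_gpt1 import GPT1ForCausalLM

# from_pretrained() picks up config.json and model.safetensors from this directory
model = GPT1ForCausalLM.from_pretrained('.')
model.eval()

# tiny smoke test with arbitrary token ids; a real run would tokenize text with
# the BPE vocabulary shipped under original_gpt1_params/
with torch.no_grad():
    logits = model(torch.randint(0, model.config.vocab_size, (1, 16))).logits
print(logits.shape)  # torch.Size([1, 16, 40478])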

gpt1-converted-weights/config.json
ADDED
@@ -0,0 +1,24 @@
{
  "architectures": [
    "GPT1ForCausalLM"
  ],
  "attention_dropout": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_gpt1.GPT1Config",
    "AutoModelForCausalLM": "modeling_gpt1.GPT1ForCausalLM"
  },
  "embd_pdrop": 0.1,
  "hidden_act": "gelu",
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 512,
  "model_type": "gpt1",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "resid_pdrop": 0.1,
  "torch_dtype": "float32",
  "transformers_version": "4.38.1",
  "vocab_size": 40478
}

gpt1-converted-weights/configuration_gpt1.py
ADDED
@@ -0,0 +1,42 @@
""" GPT1 model configuration """

from transformers.configuration_utils import PretrainedConfig


class GPT1Config(PretrainedConfig):
    model_type = "gpt1"

    def __init__(
        self,
        vocab_size=40478,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attention_dropout=0.1,
        hidden_act="gelu",
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        tie_word_embeddings=True,
        **kwargs
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
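
As a quick sanity check (a sketch, assuming `configuration_gpt1.py` is importable from the working directory, as the conversion script below also assumes), the defaults reproduce the original GPT-1 hyperparameters:

from configuration_gpt1 import GPT1Config

config = GPT1Config()
# 40478 BPE tokens, 12 layers, hidden size 768, 12 heads, 512 positions
print(config.vocab_size, config.num_hidden_layers, config.hidden_size,
      config.num_attention_heads, config.max_position_embeddings)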

gpt1-converted-weights/generation_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "_from_model_config": true,
  "transformers_version": "4.38.1"
}

gpt1-converted-weights/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dc19245dd9599204701492aecf9b89d5b130001085743adb249409040390ec02
size 466321576
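
To peek at what the checkpoint stores without instantiating the model, the `safetensors` library can list the tensors directly (a small sketch; tensor names follow the module paths defined in `modeling_gpt1.py` below):

from safetensors import safe_open

with safe_open('gpt1-converted-weights/model.safetensors',
               framework='pt', device='cpu') as f:
    for name in sorted(f.keys())[:8]:
        print(name, tuple(f.get_tensor(name).shape))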

gpt1-converted-weights/modeling_gpt1.py
ADDED
@@ -0,0 +1,237 @@
""" PyTorch GPT1 model."""

import math

import torch
from torch import nn
from transformers import PreTrainedModel
from transformers.modeling_outputs import (
    BaseModelOutput,
    CausalLMOutput,
)
from transformers.activations import get_activation

from configuration_gpt1 import GPT1Config


class GPT1MLP(nn.Module):
    def __init__(self, config: GPT1Config):
        super().__init__()
        self.activation_fn = get_activation(config.hidden_act)
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_state):
        hidden_state = self.fc1(hidden_state)
        hidden_state = self.activation_fn(hidden_state)
        hidden_state = self.fc2(hidden_state)
        return hidden_state


class GPT1Attention(nn.Module):
    def __init__(self, config: GPT1Config):
        """
        Multi-head attention layer.
        """
        super().__init__()

        assert config.hidden_size % config.num_attention_heads == 0
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.attn_dropout = nn.Dropout(p=config.attention_dropout)

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim)
        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim)
        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size)

    def forward(self, hidden_state, attn_mask):
        bs, seq_len, _ = hidden_state.size()  # (batch_size, seq_len, dim)

        # linearly project the inputs
        Q = self.q_proj(hidden_state)  # (batch_size, seq_len, n_heads * head_dim)
        K = self.k_proj(hidden_state)
        V = self.v_proj(hidden_state)

        # split into n_heads to compute attention
        queries = Q.view(bs, seq_len, self.num_heads, self.head_dim).transpose(1, 2)  # (batch_size, n_heads, seq_len, head_dim)
        keys = K.view(bs, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        values = V.view(bs, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # compute attention matmul
        keys = keys.transpose(2, 3)  # (batch_size, n_heads, head_dim, seq_len)
        attn_scores = queries @ keys  # (batch_size, n_heads, seq_len, seq_len)

        # scale
        attn_scores = attn_scores / math.sqrt(self.head_dim)

        # mask
        if attn_mask is not None:
            attn_scores = attn_scores + attn_mask

        # softmax (attention probabilities) + dropout
        attn_probs = nn.functional.softmax(attn_scores, dim=-1, dtype=torch.float32).to(Q.dtype)
        attn_probs = self.attn_dropout(attn_probs)

        # matmul
        attn_output = attn_probs @ values  # (batch_size, n_heads, seq_len, head_dim)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bs, seq_len, self.hidden_size)  # (batch_size, seq_len, n_heads * head_dim)

        # final linear
        attn_output = self.o_proj(attn_output)
        return attn_output


class GPT1DecoderLayer(nn.Module):
    def __init__(self, config: GPT1Config):
        super().__init__()
        self.attention = GPT1Attention(config)
        self.mlp = GPT1MLP(config)

        self.attention_norm = nn.LayerNorm(normalized_shape=config.hidden_size,
                                           eps=config.layer_norm_eps)
        self.mlp_norm = nn.LayerNorm(normalized_shape=config.hidden_size,
                                     eps=config.layer_norm_eps)

        self.res_dropout = nn.Dropout(p=config.resid_pdrop)

    def forward(self, hidden_state, attn_mask):
        # attention block (post-norm: LayerNorm after the residual add, as in the original GPT-1)
        residual = hidden_state
        hidden_state = self.attention(hidden_state, attn_mask)
        hidden_state = self.res_dropout(hidden_state)
        hidden_state = residual + hidden_state
        hidden_state = self.attention_norm(hidden_state)

        # feed forward fully connected
        residual = hidden_state
        hidden_state = self.mlp(hidden_state)
        hidden_state = self.res_dropout(hidden_state)
        hidden_state = residual + hidden_state
        hidden_state = self.mlp_norm(hidden_state)

        return hidden_state


class GPT1PreTrainedModel(PreTrainedModel):
    config_class = GPT1Config
    supports_gradient_checkpointing = False

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class GPT1Model(GPT1PreTrainedModel):

    def __init__(self, config: GPT1Config):
        super().__init__(config)

        # embeddings
        self.embs = nn.Embedding(config.vocab_size, config.hidden_size)
        self.embs_dropout = nn.Dropout(p=config.embd_pdrop)

        # positional encoding (learned)
        self.pos_emb = nn.Embedding(config.max_position_embeddings,
                                    config.hidden_size)

        self.layers = nn.ModuleList(
            [GPT1DecoderLayer(config) for _ in range(config.num_hidden_layers)]
        )

        self.post_init()

    def get_input_embeddings(self):
        return self.embs

    def set_input_embeddings(self, value):
        self.embs = value

    def forward(self, input_ids, *args, **kwargs):
        position_ids = torch.arange(input_ids.size(-1),
                                    dtype=torch.long,
                                    device=input_ids.device).unsqueeze_(0)

        input_embeds = self.embs(input_ids)  # (bs, seq_len, dim)
        position_embeds = self.pos_emb(position_ids)
        hidden_state = self.embs_dropout(input_embeds) + position_embeds

        # additive causal mask: 0 on and below the diagonal, -inf above
        seq_len = input_ids.size(-1)
        attn_mask = torch.full((seq_len, seq_len), fill_value=float('-inf'))
        attn_mask = torch.triu(attn_mask, diagonal=1)

        causal_mask = attn_mask.to(dtype=input_embeds.dtype,
                                   device=input_embeds.device)

        for layer in self.layers:
            hidden_state = layer(hidden_state, attn_mask=causal_mask)

        return BaseModelOutput(
            last_hidden_state=hidden_state
        )


class GPT1ForCausalLM(GPT1PreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: GPT1Config):
        super().__init__(config)
        self.model = GPT1Model(config)
        self.vocab_size = config.vocab_size

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embs

    def set_input_embeddings(self, value):
        self.model.embs = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def get_decoder(self):
        return self.model

    def set_decoder(self, decoder):
        self.model = decoder

    def forward(self, input_ids, labels=None, *args, **kwargs):
        output = self.model(input_ids)

        hidden_state = output[0]
        logits = self.lm_head(hidden_state).float()

        loss = None
        if labels is not None:
            # shift so that tokens < n predict token n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            loss_fn = torch.nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            loss = loss_fn(shift_logits, shift_labels)

        return CausalLMOutput(
            loss=loss,
            logits=logits
        )

    def prepare_inputs_for_generation(self, input_ids, *args, **kwargs):
        return { 'input_ids': input_ids }
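
A minimal smoke test for the model code itself (a sketch, assuming both files are importable from the current directory; the token ids are random, so only tensor shapes and the loss plumbing are exercised):

import torch

from configuration_gpt1 import GPT1Config
from modeling_gpt1 import GPT1ForCausalLM

config = GPT1Config()
model = GPT1ForCausalLM(config).eval()

# feeding the inputs as labels exercises the shifted next-token loss path
input_ids = torch.randint(0, config.vocab_size, (1, 16))
with torch.no_grad():
    out = model(input_ids, labels=input_ids)

print(out.logits.shape)  # torch.Size([1, 16, 40478])
print(out.loss)          # roughly ln(40478) ~ 10.6 at random initialization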

original_gpt1_params/.ipynb_checkpoints/encoder_bpe_40000-checkpoint.json
ADDED
The diff for this file is too large to render.
See raw diff

original_gpt1_params/.ipynb_checkpoints/params_shapes-checkpoint.json
ADDED
@@ -0,0 +1 @@
[[512, 768], [40478, 768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768]]

original_gpt1_params/.ipynb_checkpoints/vocab_40000-checkpoint.bpe
ADDED
The diff for this file is too large to render.
See raw diff

original_gpt1_params/encoder_bpe_40000.json
ADDED
The diff for this file is too large to render.
See raw diff

original_gpt1_params/params_0.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8d9cd095b901dfbfbe0ce5e01d151dfe0b791e955d71149969ba65a6eab4480f
size 46614044

original_gpt1_params/params_1.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ca074893c040fa69cbf2fc95c06feda45a4e1492d03b645e2076e89ccf7ddd9f
size 46614044

original_gpt1_params/params_2.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:966c25fbd632f0df18c4d4380ba57f23410f43311a96616f00b3d05ae6592f58
size 46614044

original_gpt1_params/params_3.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:40df0d328f5d3d1b2bec768855a5d2eeeaf2b2124758ef98116f76a02526fd92
size 46614044

original_gpt1_params/params_4.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:139f098dcd620ccf0200530e9ce9ff1c342714ff881a0c7258ac9faac4a06e6a
size 46614040

original_gpt1_params/params_5.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ad27b5cb245db9a29657270ff637d3ff1c15fd9df3683324a2936674cef8c3c5
size 46614040

original_gpt1_params/params_6.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af5bb5c76ddfea50683e0b9895fe704ae689853ed8bb3f1b3fee4daff2f27d45
size 46614040

original_gpt1_params/params_7.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:27f55501d895ce1adb9b254aa762519a242edf2bcd2b43298b89538b5591566c
size 46614040

original_gpt1_params/params_8.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:17a2b695128ea0aae98a360351b92769b879bc0f2835862949b6405b0ce88569
size 46614040

original_gpt1_params/params_9.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f1355fcd519db223f65db7fa7b79dcaf9b4c653915ffe4bd417d87f7903225c1
size 46614040

original_gpt1_params/params_shapes.json
ADDED
@@ -0,0 +1 @@
[[512, 768], [40478, 768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768], [1, 768, 2304], [2304], [1, 768, 768], [768], [768], [768], [1, 768, 3072], [3072], [1, 3072, 768], [768], [768], [768]]
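
For orientation, a small sketch of how this flat shapes list is laid out, mirroring what `tf_weights_to_hf.py` below assumes (entry 0 is the learned positional embedding table, entry 1 the token embedding table, then 12 tensors per transformer block):

import json

shapes = json.load(open('original_gpt1_params/params_shapes.json'))
assert len(shapes) == 2 + 12 * 12      # 2 embedding tables + 12 tensors per block x 12 blocks
assert shapes[0] == [512, 768]         # positional embeddings
assert shapes[1] == [40478, 768]       # token (BPE) embeddings
print(shapes[2:14])                    # block 0: fused qkv weight/bias, out proj, norms, mlp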

original_gpt1_params/vocab_40000.bpe
ADDED
The diff for this file is too large to render.
See raw diff

tf_weights_to_hf.py
ADDED
@@ -0,0 +1,85 @@
import json

import torch
import numpy as np

from modeling_gpt1 import GPT1ForCausalLM, GPT1Model
from configuration_gpt1 import GPT1Config


GPT1Config.register_for_auto_class()
GPT1Model.register_for_auto_class('AutoModel')
GPT1ForCausalLM.register_for_auto_class('AutoModelForCausalLM')

def lists_are_equal(list1, list2):
    for i, j in zip(list1, list2):
        if i != j:
            return False
    return True

# get the original weights from the GPT1 params.npy files
def get_weights_from_tf_model():

    shapes = json.load(open('original_gpt1_params/params_shapes.json'))
    offsets = np.cumsum([np.prod(shape) for shape in shapes])

    init_params = [np.load('original_gpt1_params/params_{}.npy'.format(n)) for n in range(10)]
    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]

    config = GPT1Config()
    model = GPT1ForCausalLM(config)

    # print(shapes[:15])
    # print([k for k, v in model.named_parameters()][:10])

    # embs layer
    model.model.embs.weight.data = torch.from_numpy(init_params[1])

    # pos enc layer
    model.model.pos_emb.weight.data = torch.from_numpy(init_params[0])

    layers = model.model.layers

    for i in range(0, 12):

        idx = 12 * i + 2

        # attention q, k, v projections
        init_params[idx] = np.squeeze(init_params[idx], axis=0)
        q, k, v = torch.split(torch.tensor(init_params[idx]), 768, dim=-1)
        layers[i].attention.q_proj.weight.data = q.detach().clone().transpose(-1, -2).contiguous()
        layers[i].attention.k_proj.weight.data = k.detach().clone().transpose(-1, -2).contiguous()
        layers[i].attention.v_proj.weight.data = v.detach().clone().transpose(-1, -2).contiguous()

        # attention q, k, v biases
        q_bias, k_bias, v_bias = torch.split(torch.tensor(init_params[idx + 1]), 768, dim=-1)
        layers[i].attention.q_proj.bias.data = q_bias.detach().clone().contiguous()
        layers[i].attention.k_proj.bias.data = k_bias.detach().clone().contiguous()
        layers[i].attention.v_proj.bias.data = v_bias.detach().clone().contiguous()

        # attention output proj + bias
        init_params[idx + 2] = np.squeeze(init_params[idx + 2], axis=0)
        layers[i].attention.o_proj.weight.data = torch.from_numpy(init_params[idx + 2]).transpose(-1, -2).contiguous()
        layers[i].attention.o_proj.bias.data = torch.from_numpy(init_params[idx + 3])

        # attention norm + bias
        layers[i].attention_norm.weight.data = torch.from_numpy(init_params[idx + 4])
        layers[i].attention_norm.bias.data = torch.from_numpy(init_params[idx + 5])

        # mlp layer
        init_params[idx + 6] = np.squeeze(init_params[idx + 6], axis=0)
        layers[i].mlp.fc1.weight.data = torch.from_numpy(init_params[idx + 6]).transpose(-1, -2).contiguous()
        layers[i].mlp.fc1.bias.data = torch.from_numpy(init_params[idx + 7])
        init_params[idx + 8] = np.squeeze(init_params[idx + 8], axis=0)
        layers[i].mlp.fc2.weight.data = torch.from_numpy(init_params[idx + 8]).transpose(-1, -2).contiguous()
        layers[i].mlp.fc2.bias.data = torch.from_numpy(init_params[idx + 9])

        # mlp norm + bias
        layers[i].mlp_norm.weight.data = torch.from_numpy(init_params[idx + 10])
        layers[i].mlp_norm.bias.data = torch.from_numpy(init_params[idx + 11])

    model.save_pretrained('gpt1-converted-weights/')


get_weights_from_tf_model()