ranarag committed on
Commit
20f83d1
·
verified ·
1 Parent(s): 5f0fa90

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +45 -45
README.md CHANGED
@@ -95,32 +95,32 @@ print(output)
95
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.63</td>
96
  </tr>
97
  <tr>
98
- <td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;"><b>Granite-3.3-2B-Base</b></td>
99
- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;"> 47.49 </td>
100
- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;"> 73.2 </td>
101
- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;"> 54.33 </td>
102
- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;"> 40.83 </td>
103
- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;"> 70.4 </td>
104
- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;"> 50.0 </td>
105
- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;"> 32.552 </td>
106
- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">24.36</td>
107
- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">38.78</td>
108
- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.22</td>
109
- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.52</td>
110
- </tr>
111
- <tr>
112
- <td style="text-align:left; background-color: #DAE8FF; color: black;">Granite-3.1-8B-Base</td>
113
- <td style="text-align:center; background-color: #DAE8FF; color: black;">53.51</td>
114
- <td style="text-align:center; background-color: #DAE8FF; color: black;">81.4</td>
115
- <td style="text-align:center; background-color: #DAE8FF; color: black;">64.28</td>
116
- <td style="text-align:center; background-color: #DAE8FF; color: black;">51.27</td>
117
- <td style="text-align:center; background-color: #DAE8FF; color: black;">76.2</td>
118
- <td style="text-align:center; background-color: #DAE8FF; color: black;">70.5</td>
119
- <td style="text-align:center; background-color: #DAE8FF; color: black;">45.87</td>
120
- <td style="text-align:center; background-color: #DAE8FF; color: black;">35.97</td>
121
- <td style="text-align:center; background-color: #DAE8FF; color: black;">48.99</td>
122
- <td style="text-align:center; background-color: #DAE8FF; color: black;">78.33</td>
123
- <td style="text-align:center; background-color: #DAE8FF; color: black;">60.63</td>
124
  </tr>
125
 
126
  <tr>
@@ -137,9 +137,9 @@ print(output)
137
  <td style="text-align:center; background-color: #DAE8FF; color: black;">78.18</td>
138
  <td style="text-align:center; background-color: #DAE8FF; color: black;">58.05</td>
139
  </tr>
140
-
141
  </tbody></table>
142
 
 
143
  **Model Architecture:**
144
  Granite-3.3-2B-Base is based on a decoder-only dense transformer architecture. Core components of this architecture are: GQA and RoPE, MLP with SwiGLU, RMSNorm, and shared input/output embeddings.
145
  <table>
@@ -152,68 +152,68 @@ Granite-3.3-2B-Base is based on a decoder-only dense transformer architecture. C
152
  <tbody>
153
  <tr>
154
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Embedding size</td>
155
- <td style="text-align:center; background-color: #FFFFFF; color: black;">2048</td>
156
- <td style="text-align:center; background-color: #DAE8FF; color: black;">4096</td>
157
  </tr>
158
  <tr>
159
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Number of layers</td>
160
- <td style="text-align:center; background-color: #FFFFFF; color: black;">40</td>
161
  <td style="text-align:center; background-color: #DAE8FF; color: black;">40</td>
 
162
  </tr>
163
  <tr>
164
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Attention head size</td>
165
- <td style="text-align:center; background-color: #FFFFFF; color: black;">64</td>
166
- <td style="text-align:center; background-color: #DAE8FF; color: black;">128</td>
167
  </tr>
168
  <tr>
169
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Number of attention heads</td>
170
- <td style="text-align:center; background-color: #FFFFFF; color: black;">32</td>
171
  <td style="text-align:center; background-color: #DAE8FF; color: black;">32</td>
 
172
  </tr>
173
  <tr>
174
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Number of KV heads</td>
175
- <td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
176
  <td style="text-align:center; background-color: #DAE8FF; color: black;">8</td>
 
177
  </tr>
178
  <tr>
179
  <td style="text-align:left; background-color: #FFFFFF; color: black;">MLP hidden size</td>
180
- <td style="text-align:center; background-color: #FFFFFF; color: black;">8192</td>
181
- <td style="text-align:center; background-color: #DAE8FF; color: black;">12800</td>
182
  </tr>
183
  <tr>
184
  <td style="text-align:left; background-color: #FFFFFF; color: black;">MLP activation</td>
185
- <td style="text-align:center; background-color: #FFFFFF; color: black;">SwiGLU</td>
186
  <td style="text-align:center; background-color: #DAE8FF; color: black;">SwiGLU</td>
 
187
  </tr>
188
  <tr>
189
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Initialization std</td>
190
- <td style="text-align:center; background-color: #FFFFFF; color: black;">0.1</td>
191
  <td style="text-align:center; background-color: #DAE8FF; color: black;">0.1</td>
 
192
  </tr>
193
  <tr>
194
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Sequence length</td>
195
- <td style="text-align:center; background-color: #FFFFFF; color: black;">128K</td>
196
  <td style="text-align:center; background-color: #DAE8FF; color: black;">128K</td>
 
197
  </tr>
198
  <tr>
199
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Position embedding</td>
200
- <td style="text-align:center; background-color: #FFFFFF; color: black;">RoPE</td>
201
  <td style="text-align:center; background-color: #DAE8FF; color: black;">RoPE</td>
 
202
  </tr>
203
  <tr>
204
  <td style="text-align:left; background-color: #FFFFFF; color: black;"># Parameters</td>
205
- <td style="text-align:center; background-color: #FFFFFF; color: black;">2.5B</td>
206
- <td style="text-align:center; background-color: #DAE8FF; color: black;">8.1B</td>
207
  </tr>
208
  <tr>
209
  <td style="text-align:left; background-color: #FFFFFF; color: black;"># Active parameters</td>
210
- <td style="text-align:center; background-color: #FFFFFF; color: black;">2.5B</td>
211
- <td style="text-align:center; background-color: #DAE8FF; color: black;">8.1B</td>
212
  </tr>
213
  <tr>
214
  <td style="text-align:left; background-color: #FFFFFF; color: black;"># Training tokens</td>
215
- <td style="text-align:center; background-color: #FFFFFF; color: black;">12T</td>
216
  <td style="text-align:center; background-color: #DAE8FF; color: black;">12T</td>
 
217
  </tr>
218
  </tbody></table>
219
 
 
95
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.63</td>
96
  </tr>
97
  <tr>
98
+ <td style="text-align:left; background-color: #DAE8FF; color: black;"><b>Granite-3.3-2B-Base</b></td>
99
+ <td style="text-align:center; background-color: #DAE8FF; color: black;"> 47.49 </td>
100
+ <td style="text-align:center; background-color: #DAE8FF; color: black;"> 73.2 </td>
101
+ <td style="text-align:center; background-color: #DAE8FF; color: black;"> 54.33 </td>
102
+ <td style="text-align:center; background-color: #DAE8FF; color: black;"> 40.83 </td>
103
+ <td style="text-align:center; background-color: #DAE8FF; color: black;"> 70.4 </td>
104
+ <td style="text-align:center; background-color: #DAE8FF; color: black;"> 50.0 </td>
105
+ <td style="text-align:center; background-color: #DAE8FF; color: black;"> 32.552 </td>
106
+ <td style="text-align:center; background-color: #DAE8FF; color: black;">24.36</td>
107
+ <td style="text-align:center; background-color: #DAE8FF; color: black;">38.78</td>
108
+ <td style="text-align:center; background-color: #DAE8FF; color: black;">63.22</td>
109
+ <td style="text-align:center; background-color: #DAE8FF; color: black;">49.52</td>
110
+ </tr>
111
+ <tr>
112
+ <td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-8B-Base</td>
113
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">53.51</td>
114
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.4</td>
115
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.28</td>
116
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">51.27</td>
117
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">76.2</td>
118
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">70.5</td>
119
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">45.87</td>
120
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">35.97</td>
121
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">48.99</td>
122
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.33</td>
123
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">60.63</td>
124
  </tr>
125
 
126
  <tr>
 
137
  <td style="text-align:center; background-color: #DAE8FF; color: black;">78.18</td>
138
  <td style="text-align:center; background-color: #DAE8FF; color: black;">58.05</td>
139
  </tr>
 
140
  </tbody></table>
141
 
142
+
143
  **Model Architecture:**
144
  Granite-3.3-2B-Base is based on a decoder-only dense transformer architecture. Core components of this architecture are: GQA and RoPE, MLP with SwiGLU, RMSNorm, and shared input/output embeddings.
145
  <table>
 
152
  <tbody>
153
  <tr>
154
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Embedding size</td>
155
+ <td style="text-align:center; background-color: #DAE8FF; color: black;">2048</td>
156
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">4096</td>
157
  </tr>
158
  <tr>
159
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Number of layers</td>
 
160
  <td style="text-align:center; background-color: #DAE8FF; color: black;">40</td>
161
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">40</td>
162
  </tr>
163
  <tr>
164
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Attention head size</td>
165
+ <td style="text-align:center; background-color: #DAE8FF; color: black;">64</td>
166
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">128</td>
167
  </tr>
168
  <tr>
169
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Number of attention heads</td>
 
170
  <td style="text-align:center; background-color: #DAE8FF; color: black;">32</td>
171
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">32</td>
172
  </tr>
173
  <tr>
174
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Number of KV heads</td>
 
175
  <td style="text-align:center; background-color: #DAE8FF; color: black;">8</td>
176
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
177
  </tr>
178
  <tr>
179
  <td style="text-align:left; background-color: #FFFFFF; color: black;">MLP hidden size</td>
180
+ <td style="text-align:center; background-color: #DAE8FF; color: black;">8192</td>
181
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">12800</td>
182
  </tr>
183
  <tr>
184
  <td style="text-align:left; background-color: #FFFFFF; color: black;">MLP activation</td>
 
185
  <td style="text-align:center; background-color: #DAE8FF; color: black;">SwiGLU</td>
186
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">SwiGLU</td>
187
  </tr>
188
  <tr>
189
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Initialization std</td>
 
190
  <td style="text-align:center; background-color: #DAE8FF; color: black;">0.1</td>
191
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">0.1</td>
192
  </tr>
193
  <tr>
194
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Sequence length</td>
 
195
  <td style="text-align:center; background-color: #DAE8FF; color: black;">128K</td>
196
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">128K</td>
197
  </tr>
198
  <tr>
199
  <td style="text-align:left; background-color: #FFFFFF; color: black;">Position embedding</td>
 
200
  <td style="text-align:center; background-color: #DAE8FF; color: black;">RoPE</td>
201
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">RoPE</td>
202
  </tr>
203
  <tr>
204
  <td style="text-align:left; background-color: #FFFFFF; color: black;"># Parameters</td>
205
+ <td style="text-align:center; background-color: #DAE8FF; color: black;">2.5B</td>
206
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">8.1B</td>
207
  </tr>
208
  <tr>
209
  <td style="text-align:left; background-color: #FFFFFF; color: black;"># Active parameters</td>
210
+ <td style="text-align:center; background-color: #DAE8FF; color: black;">2.5B</td>
211
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">8.1B</td>
212
  </tr>
213
  <tr>
214
  <td style="text-align:left; background-color: #FFFFFF; color: black;"># Training tokens</td>
 
215
  <td style="text-align:center; background-color: #DAE8FF; color: black;">12T</td>
216
+ <td style="text-align:center; background-color: #FFFFFF; color: black;">12T</td>
217
  </tr>
218
  </tbody></table>
219