convaiinnovations committed
Commit c8f0b46 · verified · 1 Parent(s): 1e6bee6

Update README.md

Files changed (1)
  1. README.md +385 -0
README.md CHANGED
@@ -31,6 +31,391 @@ A Hindi language generation model with the following specifications:
  ## Training

  The model was trained on a large corpus of Hindi text using a cosine learning rate schedule with warmup. Training utilized mixed-precision and distributed data parallel across multiple GPUs.
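
The training script itself is not part of this commit, but the recipe above maps onto standard PyTorch pieces. Purely as an illustrative sketch (learning rate, warmup and step counts, and the dummy batch are assumptions, and the real run would additionally wrap the model in `DistributedDataParallel`), one optimization step with a cosine-with-warmup schedule and mixed precision could look like this, using the `ConvaiCausalLM` and `ConvaiCausalLMConfig` classes defined in the Usage section below:

```python
import torch
import torch.nn.functional as F
from transformers import get_cosine_schedule_with_warmup

# Assumes ConvaiCausalLM / ConvaiCausalLMConfig from the Usage section below.
config = ConvaiCausalLMConfig()
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ConvaiCausalLM(config).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)            # assumed LR
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=1_000, num_training_steps=100_000     # assumed step counts
)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

# One step on a dummy batch; a real run iterates over a tokenized Hindi corpus.
input_ids = torch.randint(0, config.vocab_size, (2, 128), device=device)
optimizer.zero_grad(set_to_none=True)
with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
    logits = model(input_ids)
    # Next-token prediction: shift logits and targets by one position
    loss = F.cross_entropy(
        logits[:, :-1].reshape(-1, config.vocab_size),
        input_ids[:, 1:].reshape(-1),
    )
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
scheduler.step()
```
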
+ ## Usage
+
+ You can use this model with the following code:
+
+ ```python
+ import torch
+ import math
+ import os
+ from hindi_embeddings import SentencePieceTokenizerWrapper
+ from safetensors.torch import load_file
+ from torch import nn
+ from transformers import PreTrainedModel, PretrainedConfig
+
+
+ class ConvaiCausalLMConfig(PretrainedConfig):
+     model_type = "convaicausallm"
+
+     def __init__(
+         self,
+         vocab_size=16000,
+         hidden_size=768,
+         num_hidden_layers=12,
+         num_attention_heads=16,
+         num_key_value_heads=4,
+         intermediate_size=3072,
+         hidden_act="silu",
+         max_position_embeddings=512,
+         rope_theta=10000.0,  # Base parameter for RoPE
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_key_value_heads = num_key_value_heads
+         self.intermediate_size = intermediate_size
+         self.hidden_act = hidden_act
+         self.max_position_embeddings = max_position_embeddings
+         self.rope_theta = rope_theta
+
+
+ def precompute_freqs_cis(dim, end, theta=10000.0):
+     """Precompute the frequency tensor for complex exponentials (cos, sin)"""
+     # Ensure dim is even for complex numbers
+     assert dim % 2 == 0, "Dimension must be even"
+
+     # Create position indices for caching
+     freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
+     t = torch.arange(end).float()
+     freqs = torch.outer(t, freqs)  # [end, dim/2]
+
+     # Create complex exponentials (cos, sin pairs)
+     cos, sin = torch.cos(freqs), torch.sin(freqs)
+     return cos, sin
+
+
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None):
+     """Apply rotary position embeddings to q and k tensors"""
+     # Extract shapes
+     batch, seq_len, n_heads, head_dim = q.shape
+     _, kv_seq_len, n_kv_heads, _ = k.shape
+
+     # Handle position IDs or use sequential positions
+     if position_ids is None:
+         # Default: just use sequential positions
+         position_ids = torch.arange(seq_len, device=q.device)
+         position_ids = position_ids.unsqueeze(0).expand(batch, -1)
+
+     # Get the cosine and sine for the positions we're using
+     cos = cos[position_ids].unsqueeze(-2)  # [batch, seq, 1, dim/2]
+     sin = sin[position_ids].unsqueeze(-2)  # [batch, seq, 1, dim/2]
+
+     # q and k must be arranged in pairs for rotation
+     q_embed_dim = q.shape[-1]
+     q_half_dim = q_embed_dim // 2
+
+     # Split the embedding dimensions into pairs
+     q_half1, q_half2 = q[..., :q_half_dim], q[..., q_half_dim:]
+     k_half1, k_half2 = k[..., :q_half_dim], k[..., q_half_dim:]
+
+     # Apply rotary embeddings to each pair of dimensions
+     # For each pair (a, b), we compute (a*cos - b*sin, a*sin + b*cos)
+     q_out_half1 = q_half1 * cos - q_half2 * sin
+     q_out_half2 = q_half1 * sin + q_half2 * cos
+     k_out_half1 = k_half1 * cos - k_half2 * sin
+     k_out_half2 = k_half1 * sin + k_half2 * cos
+
+     # Concatenate back to original shape
+     q_out = torch.cat([q_out_half1, q_out_half2], dim=-1)
+     k_out = torch.cat([k_out_half1, k_out_half2], dim=-1)
+
+     return q_out, k_out
+
+
+ class GroupedQueryAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.hidden_size = config.hidden_size
+         self.num_heads = config.num_attention_heads
+         self.num_kv_heads = config.num_key_value_heads
+         self.head_dim = config.hidden_size // config.num_attention_heads
+
+         # For MQA/GQA support
+         self.num_key_value_groups = self.num_heads // self.num_kv_heads
+
+         self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim)
+         self.k_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim)
+         self.v_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim)
+         self.o_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+         # Precompute rotary position encoding frequencies
+         max_seq_len = config.max_position_embeddings
+         self.max_seq_len = max_seq_len
+
+         # Register frequencies as buffers
+         cos, sin = precompute_freqs_cis(self.head_dim, max_seq_len, config.rope_theta)
+         self.register_buffer("cos", cos)  # [max_seq_len, dim/2]
+         self.register_buffer("sin", sin)  # [max_seq_len, dim/2]
+
+         # Create causal mask for attention
+         self.register_buffer(
+             "causal_mask",
+             torch.triu(torch.ones(max_seq_len, max_seq_len) * -1e9, diagonal=1)
+         )
+
+     def forward(self, hidden_states, attention_mask=None):
+         batch_size, seq_len, _ = hidden_states.size()
+
+         # Project queries, keys, values
+         q = self.q_proj(hidden_states)
+         k = self.k_proj(hidden_states)
+         v = self.v_proj(hidden_states)
+
+         # Reshape for attention computation
+         q = q.view(batch_size, seq_len, self.num_heads, self.head_dim)
+         k = k.view(batch_size, seq_len, self.num_kv_heads, self.head_dim)
+         v = v.view(batch_size, seq_len, self.num_kv_heads, self.head_dim)
+
+         # Apply rotary position embeddings
+         q_rotary, k_rotary = apply_rotary_pos_emb(q, k, self.cos, self.sin)
+
+         # Reshape for attention computation
+         q_rotary = q_rotary.transpose(1, 2)  # [batch, heads, seq, dim]
+         k_rotary = k_rotary.transpose(1, 2)  # [batch, kv_heads, seq, dim]
+         v = v.transpose(1, 2)  # [batch, kv_heads, seq, dim]
+
+         # Handle Multi-Query Attention / Grouped-Query Attention
+         if self.num_key_value_groups > 1:
+             # Repeat k, v for each query in the group
+             k_rotary = k_rotary.repeat_interleave(self.num_key_value_groups, dim=1)
+             v = v.repeat_interleave(self.num_key_value_groups, dim=1)
+
+         # Compute attention scores
+         attn_scores = torch.matmul(q_rotary, k_rotary.transpose(-1, -2)) / (self.head_dim ** 0.5)
+
+         # Apply causal mask - only attend to previous tokens
+         causal_mask = self.causal_mask[:seq_len, :seq_len]
+         attn_scores = attn_scores + causal_mask
+
+         # Apply attention mask if provided
+         if attention_mask is not None:
+             attn_scores = attn_scores + attention_mask
+
+         # Normalize the attention scores to probabilities
+         attn_probs = torch.softmax(attn_scores, dim=-1)
+
+         # Apply attention to values
+         context = torch.matmul(attn_probs, v)  # [b, n_heads, seq, head_dim]
+
+         # Reshape back to [batch_size, seq_length, hidden_size]
+         context = context.transpose(1, 2).contiguous()
+         context = context.view(batch_size, seq_len, -1)
+
+         # Final projection
+         output = self.o_proj(context)
+
+         return output
+
+
+ class ConvaiCausalLM(PreTrainedModel):
+     config_class = ConvaiCausalLMConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.layers = nn.ModuleList([
+             nn.ModuleDict({
+                 "self_attn": GroupedQueryAttention(config),
+                 "mlp": nn.Sequential(
+                     nn.Linear(config.hidden_size, config.intermediate_size),
+                     nn.SiLU(),
+                     nn.Linear(config.intermediate_size, config.hidden_size)
+                 ),
+                 "input_layernorm": nn.LayerNorm(config.hidden_size),
+                 "post_attention_layernorm": nn.LayerNorm(config.hidden_size)
+             }) for _ in range(config.num_hidden_layers)
+         ])
+         self.norm = nn.LayerNorm(config.hidden_size)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+         # Initialize weights
+         self.apply(self._init_weights)
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+     def _prepare_attention_mask(self, attention_mask, input_shape, device):
+         # Prepare masks for attention
+         if attention_mask is None:
+             attention_mask = torch.ones(input_shape, device=device)
+
+         # Make broadcastable shape: [batch, 1, 1, seq_len]
+         extended_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+         # Convert to additive mask (0 for valid, -10000 for masked)
+         extended_mask = (1.0 - extended_mask) * -10000.0
+
+         return extended_mask
+
+     def forward(self, input_ids, attention_mask=None):
+         batch_size, seq_len = input_ids.shape
+         device = input_ids.device
+
+         # Prepare attention mask
+         if attention_mask is not None:
+             attention_mask = self._prepare_attention_mask(
+                 attention_mask, (batch_size, seq_len), device
+             )
+
+         # Get embeddings
+         hidden_states = self.embed_tokens(input_ids)
+
+         # Apply each layer
+         for layer in self.layers:
+             residual = hidden_states
+
+             # First norm and attention
+             hidden_states = layer["input_layernorm"](hidden_states)
+             hidden_states = layer["self_attn"](hidden_states, attention_mask)
+             hidden_states = residual + hidden_states
+
+             # Second norm and MLP
+             residual = hidden_states
+             hidden_states = layer["post_attention_layernorm"](hidden_states)
+             hidden_states = layer["mlp"](hidden_states)
+             hidden_states = residual + hidden_states
+
+         # Final norm
+         hidden_states = self.norm(hidden_states)
+
+         # Compute logits
+         logits = self.lm_head(hidden_states)
+
+         return logits
+
+
+ class HindiLLMGenerator:
+     def __init__(self, model_path, device=None):
+         # Set device
+         if device is None:
+             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         else:
+             self.device = torch.device(device)
+
+         print(f"Using device: {self.device}")
+
+         # Load tokenizer
+         tokenizer_path = os.path.join(model_path, "tokenizer.model")
+         self.tokenizer = SentencePieceTokenizerWrapper(tokenizer_path)
+
+         # Load model config
+         config_path = os.path.join(model_path, "config.json")
+         import json
+         with open(config_path, 'r') as f:
+             config_dict = json.load(f)
+
+         self.config = ConvaiCausalLMConfig(**config_dict)
+
+         # Load model - try safetensors first, fall back to PyTorch bin if needed
+         safetensors_path = os.path.join(model_path, "model.safetensors")
+         pytorch_path = os.path.join(model_path, "pytorch_model.bin")
+
+         self.model = ConvaiCausalLM(self.config)
+
+         # Check which format is available and load accordingly
+         if os.path.exists(safetensors_path):
+             print("Loading model from SafeTensors")
+             state_dict = load_file(safetensors_path, device="cpu")
+             self.model.load_state_dict(state_dict)
+         elif os.path.exists(pytorch_path):
+             print("Loading model from PyTorch bin")
+             self.model.load_state_dict(torch.load(pytorch_path, map_location="cpu"))
+         else:
+             raise FileNotFoundError(f"No model weights found in {model_path}")
+
+         # Move model to device and set to evaluation mode
+         self.model.to(self.device)
+         self.model.eval()
+
+     def generate(self, prompt, max_length=100, temperature=0.8, top_k=50, top_p=0.9,
+                  repetition_penalty=1.1, do_sample=True):
+         # Tokenize the prompt
+         input_ids = self.tokenizer.sp_model.EncodeAsIds(prompt)
+         input_tensor = torch.tensor([input_ids], dtype=torch.long).to(self.device)
+
+         # Start with the input tensor
+         output_sequence = input_tensor.clone()
+
+         # Generate tokens one by one
+         for _ in range(max_length - len(input_ids)):
+             with torch.no_grad():
+                 # Get the model's output for the current sequence
+                 outputs = self.model(output_sequence)
+                 next_token_logits = outputs[0, -1, :]
+
+             # Apply temperature
+             if temperature > 0:
+                 next_token_logits = next_token_logits / temperature
+
+             # Apply repetition penalty: push down logits of tokens already generated
+             if repetition_penalty > 1.0:
+                 for token_id in set(output_sequence[0].tolist()):
+                     if next_token_logits[token_id] > 0:
+                         next_token_logits[token_id] /= repetition_penalty
+                     else:
+                         next_token_logits[token_id] *= repetition_penalty
+
+             # Filter with top-k sampling
+             if top_k > 0:
+                 top_k_values, top_k_indices = torch.topk(next_token_logits, top_k)
+                 next_token_logits = torch.full_like(next_token_logits, float('-inf'))
+                 next_token_logits.scatter_(0, top_k_indices, top_k_values)
+
+             # Filter with top-p/nucleus sampling
+             if top_p < 1.0 and do_sample:
+                 sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
+                 cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+
+                 # Remove tokens with cumulative probability above the threshold
+                 sorted_indices_to_remove = cumulative_probs > top_p
+                 # Shift the indices to the right to keep the first token above the threshold
+                 sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                 sorted_indices_to_remove[..., 0] = 0
+
+                 indices_to_remove = sorted_indices[sorted_indices_to_remove]
+                 next_token_logits[indices_to_remove] = float('-inf')
+
+             # Sample or choose the next token
+             if do_sample:
+                 probs = torch.softmax(next_token_logits, dim=-1)
+                 next_token = torch.multinomial(probs, num_samples=1)
+             else:
+                 next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)
+
+             # Add the next token to the sequence
+             output_sequence = torch.cat([output_sequence, next_token.unsqueeze(0)], dim=1)
+
+             # Stop if we've generated the end-of-sequence token
+             if next_token.item() == self.tokenizer.eos_token_id:
+                 break
+
+         # Decode the generated sequence
+         generated_ids = output_sequence[0].tolist()
+         generated_text = self.tokenizer.sp_model.DecodeIds(generated_ids)
+
+         return generated_text
+
+
+ # Example usage
+ if __name__ == "__main__":
+     generator = HindiLLMGenerator("path/to/model")
+     result = generator.generate("भारत एक विशाल देश है")
+     print(result)
+ ```
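
The `generate` method above exposes the usual decoding controls, so the same generator can be switched between greedy and sampled decoding per call; the model path below is a placeholder:

```python
generator = HindiLLMGenerator("path/to/model")  # placeholder path

# Deterministic, greedy decoding
print(generator.generate("भारत एक विशाल देश है", do_sample=False))

# More conservative sampling: lower temperature, tighter nucleus, longer output
print(generator.generate("भारत एक विशाल देश है", temperature=0.7, top_p=0.85, max_length=150))
```
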
+
+ ## Example Prompts
+
+ Try the model with these example prompts:
+
+ ```
+ भारत एक विशाल देश है
+ मुझे हिंदी में एक कहानी सुनाओ
+ आज का मौसम बहुत अच्छा है
+ हिंदी साहित्य की प्रमुख विशेषताएं
+ ```
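
For a quick check, these prompts can be looped through the `generator` instance created in the snippet above (English glosses added as comments):

```python
prompts = [
    "भारत एक विशाल देश है",             # "India is a vast country"
    "मुझे हिंदी में एक कहानी सुनाओ",       # "Tell me a story in Hindi"
    "आज का मौसम बहुत अच्छा है",          # "Today's weather is very nice"
    "हिंदी साहित्य की प्रमुख विशेषताएं",    # "The main features of Hindi literature"
]

for prompt in prompts:
    print(generator.generate(prompt, max_length=120))
```
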

  ## Capabilities