---
license: mit
language: hi
tags:
- text-generation
- causal-lm
- custom-model
pipeline_tag: text-generation
---

# Hindi Causal Language Model (convaiinnovations/hindi-foundational-model-base)

This repository contains a custom-trained Hindi Causal Language Model designed for Hindi text generation.

## Model Description

- **Architecture:** Custom Transformer (12 layers, hidden=768, 16 heads, ffn=3072, act=swiglu, norm=rmsnorm) based on the `HindiCausalLM` class, with Hindi-specific optimizations:
  - Multi-resolution attention to capture both character-level and word-level patterns
  - Morphology-aware feed-forward layers
  - Script-mix processing for Hindi-English code-mixing
- **Language:** Hindi (hi)
- **Training Data:** 2.7 million high-quality Hindi text samples from:
  - IITB Parallel Corpus (1.2M sentences)
  - Samanantar (750K samples)
  - OSCAR Hindi (450K sentences)
  - CC-100 Hindi (300K sentences)
  - Hindi Wikipedia (150K articles)
  - Hindi news articles (100K pieces)
  - XNLI Hindi (50K premise-hypothesis pairs)
  - IndicGLUE (30K samples)
  - Hindi literature (5K passages)
- **Tokenizer:** SentencePiece, trained on Hindi text with a vocabulary size of 16,000
- **Training Details:** 2 epochs, hidden_size=768, num_layers=12, block_size=512, batch_size=64, learning_rate=5e-5, SwiGLU activation, RoPE positional encoding, and RMS normalization
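
The `tokenizer.model` file shipped with this repository is a standard SentencePiece model, so it can be inspected on its own with the `sentencepiece` library, independently of the custom `SentencePieceTokenizerWrapper` used in the scripts below. A minimal sketch (it assumes `tokenizer.model` has already been downloaded to the working directory, as in the download snippet in the next section):

```python
import sentencepiece as spm

# Load the SentencePiece model shipped with this repository
sp = spm.SentencePieceProcessor(model_file="tokenizer.model")

print("vocab size:", sp.get_piece_size())  # expected to report 16,000

# Round-trip a short Hindi sentence through the tokenizer
ids = sp.encode("भारत की संस्कृति", out_type=int)  # "The culture of India"
print("token ids:", ids)
print("decoded  :", sp.decode(ids))
```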

## How to Use

**⚠️ Important:** This model uses custom Python classes (`HindiCausalLM`, `HindiCausalLMConfig`, `SentencePieceTokenizerWrapper`) which are **not** part of the standard Hugging Face `transformers` library. The custom Python files are included in this repository.

### Download Required Files

```python
import os
from huggingface_hub import hf_hub_download

# Configuration
repo_id = "convaiinnovations/hindi-foundational-model-base"
model_dir = "."  # Use current directory for downloaded files

# Download model files
print(f"Downloading files for {repo_id}...")
config_path = hf_hub_download(repo_id=repo_id, filename="config.json", local_dir=model_dir)
tokenizer_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.model", local_dir=model_dir)

# Download custom module files (these are crucial!)
hindi_model_path = hf_hub_download(repo_id=repo_id, filename="hindi_language_model.py", local_dir=model_dir)
hindi_embeddings_path = hf_hub_download(repo_id=repo_id, filename="hindi_embeddings.py", local_dir=model_dir)

# Try safetensors first, then fall back to the .bin weights
try:
    weights_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors", local_dir=model_dir)
    using_safetensors = True
except Exception:
    weights_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin", local_dir=model_dir)
    using_safetensors = False

print("All necessary files downloaded.")
```
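
If you would rather mirror the whole repository than list files one by one, `huggingface_hub.snapshot_download` fetches everything in a single call; a minimal sketch, equivalent in effect to the per-file downloads above:

```python
from huggingface_hub import snapshot_download

# Fetch every file in the repo (config, tokenizer, custom .py modules, weights)
local_path = snapshot_download(
    repo_id="convaiinnovations/hindi-foundational-model-base",
    local_dir=".",
)
print(f"Repository files available under: {local_path}")
```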

### Debug and Inference Script

```python
import os
import json
import torch
import argparse  # Keep argparse for potential future use
import numpy as np
import time
import traceback  # For detailed exception info

# Try importing safetensors
try:
    import safetensors.torch
    SAFE_TENSORS_AVAILABLE = True
except ImportError:
    SAFE_TENSORS_AVAILABLE = False

print("[INFO] --- Debug Inference Script Started ---")
if SAFE_TENSORS_AVAILABLE: print("[INFO] safetensors library found.")
else: print("[WARNING] safetensors library not found.")

# --- Attempt to import custom modules ---
print("[DEBUG] Attempting to import custom modules...")
try:
    from hindi_language_model import HindiCausalLM, HindiCausalLMConfig
    from hindi_embeddings import SentencePieceTokenizerWrapper
    print("[INFO] Successfully imported custom modules.")
except ImportError as e:
    print(f"[ERROR] Failed to import custom modules: {e}"); traceback.print_exc()
# --- End Custom Module Import ---


# --- Main Generation Function Definition ---
def run_generation(
    model_path: str,
    prompt: str,
    max_len: int,
    temp: float,
    top_k: int,
    seed: int,
    device_str: str
):
    """Loads model and generates text, printing debug info."""
    print("\n[INFO] --- Starting Generation ---")
    print(f"[DEBUG] Args: path='{model_path}', max_len={max_len}, temp={temp}, top_k={top_k}, seed={seed}, device='{device_str}'")

    # --- Setup ---
    t_start_setup = time.time()
    try:
        torch.manual_seed(seed); np.random.seed(seed); device = torch.device(device_str)
        if device.type == 'cuda': torch.cuda.manual_seed_all(seed)
        print(f"[INFO] Using device: {device}")
        print(f"[DEBUG] Setup took {time.time()-t_start_setup:.4f}s")
    except Exception as e: print(f"[ERROR] Device/Seed setup failed: {e}"); traceback.print_exc(); return None

    # --- Load Tokenizer ---
    print("\n[INFO] --- Loading Tokenizer ---")
    t_start_load = time.time(); tokenizer = None
    try:
        tokenizer_model_file = os.path.join(model_path, "tokenizer.model")
        print(f"[DEBUG] Looking for tokenizer at: {tokenizer_model_file}")
        assert os.path.exists(tokenizer_model_file), "tokenizer.model not found!"
        tokenizer = SentencePieceTokenizerWrapper(tokenizer_model_file)  # Use imported class
        print(f"[INFO] Tokenizer loaded. Vocab: {getattr(tokenizer, 'vocab_size', 'N/A')}")
        # Get BOS/EOS (handle if missing)
        bos_id = getattr(tokenizer, 'bos_token_id', 1)  # Default 1
        eos_id = getattr(tokenizer, 'eos_token_id', 2)  # Default 2
        print(f"[INFO] BOS ID: {bos_id}, EOS ID: {eos_id}")
    except Exception as e: print(f"[ERROR] Tokenizer loading failed: {e}"); traceback.print_exc(); return None

    # --- Load Config ---
    print("\n[INFO] --- Loading Config ---")
    lm_config = None
    try:
        config_file = os.path.join(model_path, "config.json")
        print(f"[DEBUG] Looking for config at: {config_file}")
        assert os.path.exists(config_file), "config.json not found!"
        with open(config_file, 'r', encoding='utf-8') as f: config_dict = json.load(f)
        print("[DEBUG] Config JSON loaded.")
        # Check/fix vocab size
        tok_vocab = getattr(tokenizer, 'vocab_size', None)
        if tok_vocab and 'vocab_size' in config_dict and config_dict['vocab_size'] != tok_vocab:
            print(f"[WARN] Config/Tokenizer vocab mismatch. Using tokenizer size: {tok_vocab}"); config_dict['vocab_size'] = tok_vocab
        # Instantiate config
        if hasattr(HindiCausalLMConfig, 'from_dict'): lm_config = HindiCausalLMConfig.from_dict(config_dict)
        else: lm_config = HindiCausalLMConfig(**config_dict)
        print("[INFO] Model config loaded.")
    except Exception as e: print(f"[ERROR] Config loading failed: {e}"); traceback.print_exc(); return None

    # --- Load Model ---
    print("\n[INFO] --- Loading Model ---")
    model = None
    try:
        print(f"[DEBUG] Instantiating {HindiCausalLM.__name__}...")
        model = HindiCausalLM(lm_config); print("[INFO] Model structure created.")
        s_path = os.path.join(model_path, "model.safetensors"); b_path = os.path.join(model_path, "pytorch_model.bin")
        print(f"[DEBUG] Checking weights: {s_path} (exists: {os.path.exists(s_path)}), {b_path} (exists: {os.path.exists(b_path)})")
        if SAFE_TENSORS_AVAILABLE and os.path.exists(s_path): weights_file = s_path
        elif os.path.exists(b_path): weights_file = b_path
        else: raise FileNotFoundError("Model weights (.safetensors or .bin) not found!")
        print(f"[INFO] Loading weights from: {weights_file}")
        if weights_file.endswith(".safetensors"): state_dict = safetensors.torch.load_file(weights_file, device="cpu")
        else: state_dict = torch.load(weights_file, map_location="cpu")
        print(f"[DEBUG] State dict loaded to CPU. Keys: {len(state_dict)}")
        try: load_res = model.load_state_dict(state_dict, strict=True)
        except RuntimeError as e_load:
            print(f"[WARN] Strict load failed: {e_load}. Trying non-strict."); load_res = model.load_state_dict(state_dict, strict=False)
        missing = getattr(load_res, "missing_keys", []); unexpected = getattr(load_res, "unexpected_keys", [])
        print(f"[INFO] State dict loaded. Missing: {len(missing)}. Unexpected: {len(unexpected)}")
        if missing: print(f"[WARN] Missing keys: {missing[:5]}...")
        if unexpected: print(f"[WARN] Unexpected keys: {unexpected[:5]}...")
        del state_dict; model.to(device); model.eval()
        print("[INFO] Model loaded to device and set to eval mode.")
        print(f"[DEBUG] Tokenizer+Config+Model loading took {time.time()-t_start_load:.2f}s")
    except Exception as e: print(f"[ERROR] Model loading failed: {e}"); traceback.print_exc(); return None

    # --- Generation ---
    print("\n[INFO] --- Starting Text Generation ---")
    t_start_gen = time.time()
    print(f"[INFO] Prompt: \"{prompt}\"")
    try:
        print("[DEBUG] Encoding prompt...")
        # Use __call__ or sp_model.EncodeAsIds, whichever the wrapper provides
        if hasattr(tokenizer, '__call__'):
            print("[DEBUG] Trying tokenizer(prompt)...")
            encoded_result = tokenizer(prompt, return_tensors=None)
            if isinstance(encoded_result, dict) and 'input_ids' in encoded_result:
                input_ids = encoded_result['input_ids']
            else:
                print(f"[DEBUG] __call__ result type {type(encoded_result)} unexpected. Trying sp_model.EncodeAsIds...")
                if hasattr(tokenizer, 'sp_model') and hasattr(tokenizer.sp_model, 'EncodeAsIds'): input_ids = tokenizer.sp_model.EncodeAsIds(prompt)
                else: raise AttributeError("Cannot find suitable encoding method (__call__ or sp_model.EncodeAsIds)")
        elif hasattr(tokenizer, 'sp_model') and hasattr(tokenizer.sp_model, 'EncodeAsIds'):
            print("[DEBUG] Trying tokenizer.sp_model.EncodeAsIds...")
            input_ids = tokenizer.sp_model.EncodeAsIds(prompt)
        else: raise AttributeError("Cannot find suitable encoding method")
        print(f"[DEBUG] Prompt token IDs: {input_ids}")

        if bos_id is not None: print(f"[DEBUG] Prepending BOS {bos_id}"); input_ids = [bos_id] + input_ids
        input_tensor = torch.tensor([input_ids], dtype=torch.long, device=device); print(f"[DEBUG] Initial input tensor shape: {input_tensor.shape}")
        generated_ids = input_tensor

        print("[DEBUG] Starting generation loop...")
        with torch.no_grad():
            for i in range(max_len - len(input_ids)):
                step = i + 1; print(f"\n[DEBUG] --- Step {step}/{max_len - len(input_ids)} | Current len: {generated_ids.shape[1]} ---")
                t_fwd = time.time()

                # --- Forward call and logit extraction ---
                outputs = model(input_ids=generated_ids)  # model call
                if isinstance(outputs, dict) and 'logits' in outputs:
                    logits = outputs['logits']  # Access via key if output is a dict
                    print(f"[DEBUG] Fwd pass {time.time()-t_fwd:.4f}s. Accessed dict['logits'].")
                elif hasattr(outputs, 'logits'):
                    logits = outputs.logits  # Access via attribute if output is an object
                    print(f"[DEBUG] Fwd pass {time.time()-t_fwd:.4f}s. Accessed outputs.logits.")
                else:
                    print(f"[ERROR] Model output type is {type(outputs)} and does not contain 'logits'.")
                    raise TypeError("Model output format error.")

                next_token_logits = logits[:, -1, :]; print(f"[DEBUG] Next logits shape: {next_token_logits.shape}")

                # --- Sampling ---
                if temp > 0: scaled_logits = next_token_logits / temp
                else: scaled_logits = next_token_logits  # Greedy
                if top_k > 0: kth_vals, _ = torch.topk(scaled_logits, k=top_k, dim=-1); scaled_logits[scaled_logits < kth_vals[:, -1].unsqueeze(-1)] = -float("Inf")
                probs = torch.softmax(scaled_logits, dim=-1); next_token_id = torch.multinomial(probs, num_samples=1); print(f"[DEBUG] Sampled ID: {next_token_id.item()}")
                generated_ids = torch.cat([generated_ids, next_token_id], dim=1)
                if next_token_id.item() == eos_id: print(f"[INFO] EOS token {eos_id} generated."); break
            else: print(f"[INFO] Reached max length {max_len}.")

        # --- Decode ---
        print("\n[DEBUG] --- Post-processing ---")
        output_ids = generated_ids[0].cpu().tolist(); print(f"[DEBUG] Raw output IDs: {output_ids}")
        processed_ids = output_ids
        if bos_id and processed_ids and processed_ids[0] == bos_id: print("[DEBUG] Removing BOS"); processed_ids = processed_ids[1:]
        if eos_id and processed_ids and processed_ids[-1] == eos_id: print("[DEBUG] Removing EOS"); processed_ids = processed_ids[:-1]
        print(f"[DEBUG] Processed IDs: {processed_ids}")
        print("[INFO] Decoding...")
        # Use sp_model.DecodeIds or decode, whichever the wrapper provides
        if hasattr(tokenizer, 'sp_model') and hasattr(tokenizer.sp_model, 'DecodeIds'):
            print("[DEBUG] Decoding using tokenizer.sp_model.DecodeIds..."); generated_text = tokenizer.sp_model.DecodeIds(processed_ids)
        elif hasattr(tokenizer, 'decode'):
            print("[DEBUG] Decoding using tokenizer.decode..."); generated_text = tokenizer.decode(processed_ids)
        else: raise AttributeError("Cannot find suitable decoding method")
        print(f"[DEBUG] Decoded text: '{generated_text}'")
        print(f"[INFO] Generation successful ({time.time() - t_start_gen:.2f}s).")
        return generated_text

    except Exception as e: print(f"[ERROR] Generation loop error: {e}"); traceback.print_exc(); return None
# --- End Generation Function Definition ---


# --- Main Execution Block ---
if __name__ == "__main__":
    # --- Parameters ---
    model_dir = "."  # Use current directory if files are downloaded here
    prompt = "गंगा नदी"  # "The Ganges river"
    max_len = 80
    temp = 2
    top_k = 45
    seed = 42
    device = "cuda" if torch.cuda.is_available() else "cpu"

    print("\n[INFO] --- Simple Hindi Text Generation Script ---")
    print(f"[INFO] Model Dir: {model_dir}")
    print(f"[INFO] Prompt: \"{prompt}\"")
    print(f"[INFO] Max Length: {max_len}")
    print(f"[INFO] Temperature: {temp}")
    print(f"[INFO] Top-K: {top_k}")
    print(f"[INFO] Seed: {seed}")
    print(f"[INFO] Device: {device}")
    print("-" * 30)

    # --- Validate Path ---
    if not os.path.isdir(model_dir): print(f"[ERROR] Model directory not found: {model_dir}"); exit(1)

    # --- Run Generation ---
    generated_output = run_generation(
        model_path=model_dir, prompt=prompt, max_len=max_len,
        temp=temp, top_k=top_k, seed=seed, device_str=device
    )

    # --- Print Result ---
    print("\n" + "=" * 20 + " Final Generation Result " + "=" * 20)
    if generated_output is not None:
        print(f"Prompt: {prompt}")
        print("-" * (40 + len(" Final Generation Result ")))
        print("Generated Text:")
        print(generated_output)
    else:
        print("\n[FAILURE] Text generation failed. Check print statements above.")
    print("=" * (40 + len(" Final Generation Result ")))
```
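
The sampling step inside the loop above (temperature scaling, then top-k filtering, then a multinomial draw) can be exercised in isolation on a dummy logits tensor; a minimal self-contained sketch of the same arithmetic, with toy values chosen only for illustration:

```python
import torch

torch.manual_seed(0)

# Dummy next-token logits: batch of 1, toy vocabulary of 10
next_token_logits = torch.randn(1, 10)
temp, top_k = 0.7, 5

# Temperature scaling (temp > 0); temp == 0 would mean greedy argmax instead
scaled_logits = next_token_logits / temp

# Top-k filtering: everything below the k-th largest logit is masked out
kth_vals, _ = torch.topk(scaled_logits, k=top_k, dim=-1)
scaled_logits[scaled_logits < kth_vals[:, -1].unsqueeze(-1)] = -float("Inf")

# Softmax over the surviving logits, then draw one token id
probs = torch.softmax(scaled_logits, dim=-1)
next_token_id = torch.multinomial(probs, num_samples=1)
print("sampled id:", next_token_id.item())
```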

## Example Outputs

### Basic Example

```python
prompt = "हिंदी भाषा"  # "The Hindi language"
# Output: "हिंदी भाषा भारत की सबसे महत्वपूर्ण भाषाओं में से एक है। यह भारत के उत्तर भारत के राज्यों में मुख्य भाषा के रूप में बोली जाती है..."
# (Rough English gloss: "Hindi is one of the most important languages of India. It is spoken as the main language in the states of northern India...")
```

### Creative Writing Example

```python
prompt = "एक बार की बात है"  # "Once upon a time"
# Output: "एक बार की बात है, जब मैं छोटा था, तब मेरे दादाजी मुझे एक कहानी सुनाया करते थे। वह कहानी एक ऐसे राजा की थी जो अपने राज्य में..."
# (Rough English gloss: "Once upon a time, when I was little, my grandfather used to tell me a story. It was the story of a king who, in his kingdom...")
```

## Limitations and Biases

- The model may reflect biases present in its training data, including potential cultural, gender, or regional biases found in source materials.
- Performance is limited by its architecture size (12 layers, hidden=768) and training dataset size.
- May generate repetitive, nonsensical, or factually incorrect text.
- Uses weighted pooling with sensitivity to Hindi's SOV structure, but may struggle with complex semantic relationships in longer texts.
- May have particular difficulties with:
  - Cultural concepts lacking direct English translations
  - Idiomatic expressions specific to Hindi
  - Formal/informal speech distinctions
  - Hindi-specific morphological complexities

## License

This model is licensed under the MIT License.

Please use this model responsibly.