import torch
import time

from architecture import SmallGPT
from tokenizers import Tokenizer


def load_tokenizer(path="smptokenizer/tokenizer.json"):
    return Tokenizer.from_file(path)


def generate_text_streaming(model, tokenizer, start_text, device, max_length=64,
                            temperature=1.0, max_new_tokens=20, repetition_penalty=1.2):
    """Generate text token by token, yielding each new token as it is sampled."""
    model.eval()

    # Encode the prompt
    input_ids = tokenizer.encode(start_text).ids

    # Print the starting text; the model continues from here
    print("Generated Sentence:")
    print(start_text, end="", flush=True)

    # NOTE: "<eos>" is an assumed special-token name; adjust it to match your
    # tokenizer. token_to_id returns None if the token is missing.
    eos_id = tokenizer.token_to_id("<eos>")

    current_ids = list(input_ids)
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Keep the input within the model's context window
            current_input = current_ids[-(max_length - 1):] if len(current_ids) >= max_length else current_ids
            input_tensor = torch.tensor([current_input], dtype=torch.long, device=device)

            # Forward pass; take logits for the last position
            logits = model(input_tensor)
            next_token_logits = logits[0, -1, :] / temperature

            # Repetition penalty (CTRL-style): dampen tokens already seen.
            # Positive logits are divided and negative logits multiplied, so
            # the penalty always lowers the token's probability.
            if repetition_penalty > 1.0:
                for token_id in set(current_ids):
                    if next_token_logits[token_id] > 0:
                        next_token_logits[token_id] /= repetition_penalty
                    else:
                        next_token_logits[token_id] *= repetition_penalty

            # Sample the next token from the softmax distribution
            probs = torch.softmax(next_token_logits, dim=-1)
            next_token_id = torch.multinomial(probs, 1).item()

            # Stop at end-of-sequence
            if eos_id is not None and next_token_id == eos_id:
                break

            current_ids.append(next_token_id)

            # Decode and yield the new token
            yield tokenizer.decode([next_token_id])


def main(seed):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load tokenizer
    tokenizer_path = "smptokenizer/tokenizer.json"
    tokenizer = load_tokenizer(tokenizer_path)
    vocab_size = tokenizer.get_vocab_size()
    # NOTE: "<pad>" is an assumed special-token name; adjust it to match your
    # tokenizer. Falls back to 0 if the token is missing.
    pad_id = tokenizer.token_to_id("<pad>") or 0

    # Model hyperparameters (must match training)
    d_model = 256
    n_heads = 8
    n_layers = 6
    max_length = 172

    # Instantiate the model
    model = SmallGPT(
        vocab_size=vocab_size,
        d_model=d_model,
        n_heads=n_heads,
        n_layers=n_layers,
        max_length=max_length,
        pad_idx=pad_id,
    ).to(device)

    # Load the trained weights (.bin checkpoint; safetensors loading is untested)
    model_path = "models/pytorch_model.bin"
    try:
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()
        print(f"Model loaded from {model_path}")
    except FileNotFoundError:
        print(f"Error: Model file not found at {model_path}")
        print("Please ensure the model is trained and the path is correct.")
        return

    while True:
        # Reset the seed so identical prompts produce identical output
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)

        start_text = input("Enter a starting word or phrase (or 'quit' to exit): ")
        if start_text.lower() == "quit":
            break
        if not start_text.strip():
            print("No text entered; using the current timestamp as a starting point.")
            start_text = str(time.time())

        print("Generating...")
        token_count = 0
        start_time = time.time()

        for token in generate_text_streaming(
            model=model,
            tokenizer=tokenizer,
            start_text=start_text,
            device=device,
            max_new_tokens=1000,
            temperature=0.7,
            max_length=max_length,
            repetition_penalty=1.2,
        ):
            print(token, end="", flush=True)
            token_count += 1

        elapsed_time = time.time() - start_time
        tokens_per_sec = token_count / elapsed_time if elapsed_time > 0 else 0
        print(f"\n\nPerformance: {tokens_per_sec:.2f} tokens/sec")
        print("-" * 30)


if __name__ == "__main__":
    seed = 42
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    main(seed)
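
# Example session, for reference (illustrative only; the script name, prompt,
# and continuation are placeholders, and throughput depends on your hardware
# and checkpoint):
#
#   $ python generate.py
#   Using device: cuda
#   Model loaded from models/pytorch_model.bin
#   Enter a starting word or phrase (or 'quit' to exit): Once upon a time
#   Generating...
#   Generated Sentence:
#   Once upon a time <model continuation streams here token by token>
#
#   Performance: <tokens/sec>
#   ------------------------------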