import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Determine the device to use (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer
# Move the model to the determined device
model = AutoModelForCausalLM.from_pretrained(
    "FlameF0X/SnowflakeCore-G1-Tiny2",
    trust_remote_code=True,
    force_download=True,
    use_safetensors=True,
).to(device)  # Move model to GPU or CPU

tokenizer = AutoTokenizer.from_pretrained(
    "FlameF0X/SnowflakeCore-G1-Tiny2",
    trust_remote_code=True,
    force_download=True,
    use_safetensors=True,
)


def custom_greedy_generate(prompt, max_length=50):
    """
    Generates text using a custom greedy decoding approach.
    The model and input tensors are moved to the appropriate device (GPU/CPU).
    """
    model.eval()
    # Move input_ids to the same device as the model
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generated = input_ids

    with torch.no_grad():
        for _ in range(max_length):
            # Ensure the generated tensor is on the correct device for model input
            outputs = model(input_ids=generated)
            next_token_logits = outputs["logits"][:, -1, :]
            next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)
            generated = torch.cat((generated, next_token_id), dim=1)
            if next_token_id.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(generated[0], skip_special_tokens=True)


def gradio_generate(prompt):
    """
    Wrapper function for the Gradio interface.
    """
    return custom_greedy_generate(prompt)


# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_generate,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Generated Text"),
    title="SnowflakeCore-G1-Tiny2 Text Generation",
    description=f"Enter a prompt and generate text using the SnowflakeCore-G1-Tiny2 model. Running on: {device}",
)

# Launch the Gradio application
if __name__ == "__main__":
    iface.launch()