import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Determine the device to use (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer
# Move the model to the determined device
model = AutoModelForCausalLM.from_pretrained(
    "FlameF0X/SnowflakeCore-G1-Tiny2",
    trust_remote_code=True,
    force_download=True,
    use_safetensors=True,
).to(device)  # Move model to GPU or CPU

tokenizer = AutoTokenizer.from_pretrained(
    "FlameF0X/SnowflakeCore-G1-Tiny2",
    trust_remote_code=True,
    force_download=True,
    use_safetensors=True,
)


def custom_greedy_generate(prompt, max_length=50):
    """
    Generates text using a custom greedy decoding approach.
    The model and input tensors are moved to the appropriate device (GPU/CPU).
    """
    model.eval()
    # Move input_ids to the same device as the model
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generated = input_ids

    with torch.no_grad():
        for _ in range(max_length):
            # Ensure the generated tensor is on the correct device for model input
            outputs = model(input_ids=generated)
            next_token_logits = outputs["logits"][:, -1, :]
            next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)
            generated = torch.cat((generated, next_token_id), dim=1)
            if next_token_id.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(generated[0], skip_special_tokens=True)


def gradio_generate(prompt):
    """
    Wrapper function for the Gradio interface.
    """
    return custom_greedy_generate(prompt)


# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_generate,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Generated Text"),
    title="SnowflakeCore-G1-Tiny2 Text Generation",
    description=f"Enter a prompt and generate text using the SnowflakeCore-G1-Tiny2 model. Running on: {device}",
)

# Launch the Gradio application
if __name__ == "__main__":
    iface.launch()