import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load model and tokenizer with optimizations
model_name = "Amir230703/phi3-medmcqa-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",  # Requires the accelerate package
    attn_implementation="flash_attention_2"  # Faster attention; requires flash-attn and a supported GPU
).eval()

# Use faster kernels if available
if torch.cuda.is_available():
    model = torch.compile(model)

def generate_answer(question):
    # Create structured prompt
    prompt = f"""Instruction: Answer the following medical question concisely.
Question: {question}
Answer:"""

    # Tokenize with optimized settings
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True
    ).to(model.device)

    # Generate with optimized parameters
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,  # Reduced from 200
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,  # Discourage repetition
            num_return_sequences=1
        )

    # Decode and keep only the text after the "Answer:" marker
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.split("Answer:")[-1].strip()

# Gradio interface with queueing
demo = gr.Interface(
    fn=generate_answer,
    inputs=gr.Textbox(placeholder="Enter your medical question...", lines=3),
    outputs=gr.Textbox(label="Answer"),
    title="Medical QA Assistant",
    description="AI-powered medical question answering. Please be specific in your queries.",
    allow_flagging="never"
)
demo.queue()  # Explicitly enable request queueing

# Launch with performance settings
demo.launch(
    server_name="0.0.0.0" if torch.cuda.is_available() else None,
    share=False,
    max_threads=2
)
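
# Illustrative smoke test (a sketch, not part of the original script): once the app
# is running, the Interface can be queried from another process with gradio_client.
# The URL/port and the sample question below are assumptions for illustration only;
# for gr.Interface the generated endpoint is named "/predict" by default.
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   print(client.predict("What is the first-line treatment for anaphylaxis?",
#                        api_name="/predict"))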