import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from spaces import GPU

# Load base model and tokenizer
BASE_MODEL_NAME = "NousResearch/Meta-Llama-3-8B"
LORA_MODEL_NAME = "ubiodee/plutus_llm"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, use_fast=False)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Apply LoRA weights
model = PeftModel.from_pretrained(base_model, LORA_MODEL_NAME)

# Set a padding token if the tokenizer lacks one (Llama tokenizers ship without it)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model.eval()

# Response function with ZeroGPU decorator
@GPU
def generate_response(prompt, max_new_tokens=200, temperature=0.7, top_p=0.9):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    # Slice off the prompt tokens before decoding; decoding the full sequence
    # and stripping the prompt string is unreliable because the tokenizer
    # round-trip does not always reproduce the input text exactly.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()

# Gradio UI
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(label="Enter your prompt", lines=4, placeholder="Ask about Plutus..."),
        gr.Slider(label="Max New Tokens", minimum=50, maximum=500, value=200, step=10),
        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.7, step=0.1),
        gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.9, step=0.05),
    ],
    outputs=gr.Textbox(label="Model Response"),
    title="Cardano Plutus AI Assistant",
    description="Ask questions about Plutus smart contracts or Cardano blockchain using ubiodee/plutus_llm.",
)

if __name__ == "__main__":
    demo.launch()
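
# Programmatic usage (a minimal sketch, not part of the app itself): once the
# app is running, the same endpoint can be queried from another process with
# gradio_client. "/predict" is Gradio's default api_name for an Interface, and
# the local URL assumes a default `demo.launch()` on port 7860.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   answer = client.predict(
#       "What does a Plutus validator script check?",  # prompt
#       200,   # max_new_tokens
#       0.7,   # temperature
#       0.9,   # top_p
#       api_name="/predict",
#   )
#   print(answer)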