import gradio as gr
import torch
from unsloth import FastLanguageModel
from peft import PeftModel

# Load the 4-bit quantized base model and its tokenizer
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-9b-bnb-4bit",
    max_seq_length = 2048,
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32,
    load_in_4bit = True,
)

# Attach your LoRA adapter on top of the base model
model = PeftModel.from_pretrained(
    base_model,
    "vansh-myth/gemma-2-mumbai_slang-adapter",  # 🔁 Change this to your HF repo path
    device_map = "auto",
)

# Enable Unsloth's fast inference mode
FastLanguageModel.for_inference(model)

# Note: device placement is already handled by device_map="auto";
# calling .to("cuda") on a 4-bit quantized model would raise an error,
# so no manual move is needed here.

# Chat function: tokenize the prompt, generate, and decode the reply
def chat(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Gradio UI
iface = gr.Interface(
    fn=chat,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Gemma-2-9B LoRA Chat",
    description="LoRA fine-tuned version of Gemma-2-9B on Hugging Face Spaces.",
)

iface.launch()