import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, GPTQConfig
import torch
# Repository that holds the fine-tuned model and its tokenizer.
model_repo = "Dumele/viv-updated"  # Replace with your actual repository

# 4-bit GPTQ quantization settings with the exllama kernels disabled.
quantization_config = GPTQConfig(bits=4, disable_exllama=True)

# Pull the tokenizer and the quantized weights from the same repository.
tokenizer = AutoTokenizer.from_pretrained(model_repo)
model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    quantization_config=quantization_config,
)

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Wrap model + tokenizer in a text-generation pipeline. The pipeline takes a
# device index: 0 when CUDA is available, -1 otherwise.
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
def chat_with_model(prompt: str) -> str:
    """Generate the assistant's reply to a single user question.

    The question is wrapped in the ``###Human: ... ###Assistant:`` template
    and passed to the module-level ``text_generator`` pipeline.

    Args:
        prompt: The user's question as plain text.

    Returns:
        The newly generated text only (the prompt template is not included),
        with leading and trailing whitespace stripped.
    """
    formatted_prompt = f"###Human: Answer this question: {prompt}\n###Assistant:"
    # max_new_tokens limits only the reply. The previous max_length=100 also
    # counted the prompt tokens, so a long question left little or no room
    # for an answer.
    # return_full_text=False lets the pipeline drop the prompt itself instead
    # of relying on an exact string match against the decoded output.
    outputs = text_generator(
        formatted_prompt,
        max_new_tokens=100,
        num_return_sequences=1,
        return_full_text=False,
    )
    return outputs[0]["generated_text"].strip()
# Build the web UI around chat_with_model, using Gradio's "text" shortcut for
# both the input and the output component.
iface = gr.Interface(
    title="Mistral 7B Chatbot",
    description="A chatbot powered by the Mistral 7B model fine-tuned on a custom dataset.",
    fn=chat_with_model,
    inputs="text",
    outputs="text",
)

# Start serving the interface.
iface.launch()