# pip install gradio
import torch
import gradio as gr
from huggingface_hub import login  # needed only if the model repository is private
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, pipeline
# from datasets import Dataset
# from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# Define the repository where your model is saved
model_repo = "Dumele/viv-updated2"  # Replace with your actual repository name

# Load the tokenizer from the repository
tokenizer = AutoTokenizer.from_pretrained(model_repo)

# Define the quantization configuration with `disable_exllama` set to True
quantization_config = GPTQConfig(bits=4, disable_exllama=True)

# Load the model with the custom configuration
model = AutoModelForCausalLM.from_pretrained(model_repo, quantization_config=quantization_config)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create a text generation pipeline
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Quick sanity check: generate a response for a sample prompt and print it
prompt = "###Human: Answer this question: What exactly does Viv do?\n###Assistant:"
generated_text = text_generator(prompt, max_length=100, num_return_sequences=1)
print(generated_text[0]["generated_text"])


def generate_response(prompt):
    generated_text = text_generator(prompt, max_length=100, num_return_sequences=1)
    return generated_text[0]["generated_text"]


# Create a Gradio interface for interacting with the model
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Chat with VivBeta",
    description="Enter a prompt to interact with the fine-tuned model.",
)

iface.launch()
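
# --- Optional usage sketch (assumption, not part of the original app) ---
# Once iface.launch() is running, the interface can also be queried from another
# process with the `gradio_client` package. This is a minimal sketch; the local
# URL and the "/predict" endpoint name are assumptions based on Gradio defaults.
#
# from gradio_client import Client
#
# client = Client("http://127.0.0.1:7860")  # assumed default local address
# reply = client.predict(
#     "###Human: Answer this question: What exactly does Viv do?\n###Assistant:",
#     api_name="/predict",  # default endpoint name for a single gr.Interface
# )
# print(reply)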