import gradio as gr
from huggingface_hub import login

# !pip install accelerate peft bitsandbytes
# !pip install git+https://github.com/huggingface/transformers trl py7zr auto-gptq optimum

import torch
# from datasets import Dataset
# from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, TrainingArguments
# from trl import SFTTrainer
# import pandas as pd
# import json


# def load_data_to_dataframe(json_file_path):
#     """
#     Load data from a JSON file and create a DataFrame with questions and answers.
#
#     Args:
#         json_file_path (str): Path to the JSON file.
#
#     Returns:
#         pd.DataFrame: DataFrame containing the questions and answers.
#     """
#     questions = []
#     answers = []
#
#     with open(json_file_path, 'r') as f:
#         data = json.load(f)
#
#     for entry in data:
#         for message in entry["messages"]:
#             if message["role"] == "user":
#                 questions.append(message["content"])
#             elif message["role"] == "assistant":
#                 answers.append(message["content"])
#
#     # Create DataFrame
#     df = pd.DataFrame({
#         'question': questions,
#         'answer': answers
#     })
#     return df


# def finetune_mistral_7b():
#     # Replace 'your_token' with your actual Hugging Face token and run login('your_token')
#     # beforehand so the trainer can push to the Hub
#     json_file_path = 'Dataset for finetuning Viv.json'
#     df = load_data_to_dataframe(json_file_path)
#     df["text"] = df[["question", "answer"]].apply(
#         lambda x: "###Human: Answer this question: " + x["question"] + "\n###Assistant: " + x["answer"],
#         axis=1,
#     )
#     print(df.iloc[0])
#     data = Dataset.from_pandas(df)
#
#     tokenizer = AutoTokenizer.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")
#     tokenizer.pad_token = tokenizer.eos_token
#
#     quantization_config_loading = GPTQConfig(bits=4, disable_exllama=True, tokenizer=tokenizer)
#     model = AutoModelForCausalLM.from_pretrained(
#         "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ",
#         quantization_config=quantization_config_loading,
#         device_map="auto"
#     )
#     print(model)
#
#     model.config.use_cache = False
#     model.config.pretraining_tp = 1
#     model.gradient_checkpointing_enable()
#     model = prepare_model_for_kbit_training(model)
#
#     peft_config = LoraConfig(
#         r=16, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
#         target_modules=["q_proj", "v_proj"]
#     )
#     model = get_peft_model(model, peft_config)
#
#     training_arguments = TrainingArguments(
#         output_dir="mistral-finetuned-Viv",
#         per_device_train_batch_size=8,
#         gradient_accumulation_steps=1,
#         optim="paged_adamw_32bit",
#         learning_rate=2e-4,
#         lr_scheduler_type="cosine",
#         save_strategy="epoch",
#         logging_steps=100,
#         num_train_epochs=1,
#         max_steps=100,
#         fp16=True,
#         push_to_hub=True,
#         hub_model_id="Dumele/viv-updated2",  # Specify the repository name
#         hub_strategy="every_save"
#     )
#
#     trainer = SFTTrainer(
#         model=model,
#         train_dataset=data,
#         peft_config=peft_config,
#         dataset_text_field="text",
#         args=training_arguments,
#         tokenizer=tokenizer,
#         packing=False,
#         max_seq_length=512
#     )
#     trainer.train()
#     trainer.push_to_hub()


# if __name__ == "__main__":
#     finetune_mistral_7b()
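# The commented-out load_data_to_dataframe() above expects a JSON file containing a
# list of entries, each with a "messages" list of {"role", "content"} dicts: user
# turns become questions, assistant turns become answers. A minimal sketch of one
# record (placeholder strings, not taken from 'Dataset for finetuning Viv.json'):
example_dataset = [
    {
        "messages": [
            {"role": "user", "content": "<question text>"},
            {"role": "assistant", "content": "<answer text>"},
        ]
    }
]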
# Inference: load the fine-tuned model from the Hub and generate a sample answer.

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch

# Define the repository where your model is saved
model_repo = "Dumele/viv-updated2"  # Replace with your actual repository

# Load the tokenizer from the repository
tokenizer = AutoTokenizer.from_pretrained(model_repo)

# Define the configuration with `disable_exllama` set to True
quantization_config = GPTQConfig(bits=4, disable_exllama=True)

# Load the model with the custom configuration
model = AutoModelForCausalLM.from_pretrained(model_repo, quantization_config=quantization_config)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

from transformers import pipeline

# Create a text generation pipeline
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Define a prompt in the same format used during fine-tuning
prompt = "###Human: Answer this question: What exactly does Viv do?\n###Assistant:"

# Generate text
generated_text = text_generator(prompt, max_length=100, num_return_sequences=1)

# Print the generated text
print(generated_text[0]['generated_text'])

# !pip install gradio

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, pipeline
import gradio as gr

# Define the repository where your model is saved
model_repo = "Dumele/viv-updated2"  # Replace with your actual repository name

# Load the tokenizer from the repository
tokenizer = AutoTokenizer.from_pretrained(model_repo)

# Define the configuration with `disable_exllama` set to True
quantization_config = GPTQConfig(bits=4, disable_exllama=True)

# Load the model with the custom configuration
model = AutoModelForCausalLM.from_pretrained(model_repo, quantization_config=quantization_config)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create a text generation pipeline
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)


def generate_response(prompt):
    generated_text = text_generator(prompt, max_length=100, num_return_sequences=1)
    return generated_text[0]['generated_text']


# Create a Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Chat with VivBeta",
    description="Enter a prompt to interact with the fine-tuned model."
)

iface.launch()
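# The Gradio app above passes the user's text to the model verbatim and returns the
# full pipeline output, which includes the echoed prompt. A minimal sketch of a
# variant that wraps the question in the same "###Human:/###Assistant:" template used
# during fine-tuning and returns only the generated reply; answer_question is an
# illustrative name, not part of the original app.
def answer_question(question):
    prompt = "###Human: Answer this question: " + question + "\n###Assistant:"
    output = text_generator(prompt, max_new_tokens=100, num_return_sequences=1)
    full_text = output[0]["generated_text"]
    # The pipeline returns prompt + completion; keep only the completion.
    return full_text[len(prompt):].strip()

# To serve this variant instead, point the interface at it, e.g.:
# iface = gr.Interface(fn=answer_question, inputs=gr.Textbox(lines=2), outputs="text")
# iface.launch()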