import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from Features.chat_interface import start_chat_interface  # Project-local chat UI helper

# --- Step 1: Prepare Your Data ---
data_path = "data/chatbot_data.csv"  # Replace with your dataset file
data = pd.read_csv(data_path)
data = data[["user_input", "chatbot_response"]]  # Adjust column names if needed

# --- Step 2: Choose a Pre-trained Model ---
model_name = "microsoft/DialoGPT-medium"  # A good starting point, experiment with others!
tokenizer = AutoTokenizer.from_pretrained(model_name)
# DialoGPT is a decoder-only (causal) model, so it must be loaded with
# AutoModelForCausalLM, not AutoModelForSeq2SeqLM.
model = AutoModelForCausalLM.from_pretrained(model_name)
# DialoGPT's tokenizer has no pad token; reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

# --- Step 3: Tokenize the Data ---
def preprocess(examples):
    # DialoGPT is trained on EOS-separated turns, so join each user message
    # and its target response into a single sequence.
    texts = [
        user + tokenizer.eos_token + response + tokenizer.eos_token
        for user, response in zip(examples["user_input"], examples["chatbot_response"])
    ]
    return tokenizer(texts, max_length=128, truncation=True)

dataset = Dataset.from_pandas(data)
tokenized_data = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)

# The collator pads each batch dynamically and builds the language-modeling
# labels from input_ids (padding positions are masked out with -100).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- Step 4: Fine-Tune the Model ---
training_args = TrainingArguments(
    "my-chatbot",  # Output folder name
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,  # Save a model checkpoint every 500 steps
    push_to_hub=True,  # Push to your Hugging Face Hub (requires `huggingface-cli login`)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    eval_dataset=tokenized_data,  # For a real run, hold out a separate validation split
    data_collator=data_collator,
)

trainer.train()

# --- Step 5: Use Your Fine-Tuned Model (After Training) ---
def generate_response(user_input):
    input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors="pt")
    output_sequences = model.generate(
        input_ids=input_ids,
        max_new_tokens=128,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens, skipping the echoed prompt.
    response = tokenizer.decode(
        output_sequences[0][input_ids.shape[-1]:], skip_special_tokens=True
    )
    return response
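
# --- Step 6: Try It Out (Optional) ---
# A minimal smoke-test sketch, assuming you want to chat from the terminal.
# The loop below is illustrative and is NOT part of Features.chat_interface;
# if your project's start_chat_interface helper expects a different entry
# point, use that instead of this loop.
if __name__ == "__main__":
    print("Chatbot ready. Type 'quit' to exit.")
    while True:
        user_message = input("You: ")
        if user_message.strip().lower() == "quit":
            break
        print("Bot:", generate_response(user_message))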