import logging
import re

from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Minimal hand-rolled stopword list used by clean_text below.
stop_words = {"and", "or", "but", "the", "is", "are", "was", "were", "a", "an", "in", "on", "at", "of", "to", "with"}


def stem_word(word):
    # Naive suffix stripping; longer suffixes come first so "es" is not shadowed by "s".
    suffixes = ["ing", "ed", "es", "er", "ly", "s"]
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
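# For example: stem_word("running") -> "runn", stem_word("classes") -> "class".
# Crude compared to a real stemmer (e.g. NLTK's PorterStemmer), but dependency-free.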


def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation
    text = re.sub(r'\d+', '', text)      # strip digits
    text = text.lower()
    text = " ".join(word for word in text.split() if word not in stop_words)
    text = " ".join(stem_word(word) for word in text.split())
    return text
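# For example: clean_text("The cats, and 2 dogs!") -> "cat dog"
# (punctuation, digits, and stopwords removed, then each remaining word stemmed).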


def read_prompts(file_path):
    input_texts = []
    target_texts = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Slice off the prefix rather than using str.replace(), which would
            # also delete later occurrences of "input:"/"target:" inside the text.
            if line.startswith("input:"):
                input_texts.append(line[len("input:"):].strip())
            elif line.startswith("target:"):
                target_texts.append(line[len("target:"):].strip())
    return input_texts, target_texts
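# Assumed prompts.txt layout (inferred from the parsing above), one pair per two lines:
#   input: <source text>
#   target: <expected output text>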


def prepare_data(input_texts, target_texts, tokenizer):
    inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
    # Replace padding token ids in the labels with -100 so the loss ignores them.
    labels = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in targets["input_ids"]
    ]
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels}


def fine_tune_model():
    # Start from the base checkpoint; loading "./fine_tuned_model" here would fail
    # on the first run, before that directory has been created by save_pretrained.
    model_name = "t5-base"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    try:
        logger.info("Reading and cleaning prompts.")
        input_texts, target_texts = read_prompts("prompts.txt")
        input_texts_cleaned = [clean_text(text) for text in input_texts]
        target_texts_cleaned = [clean_text(text) for text in target_texts]

        logger.info("Splitting dataset into training and validation sets.")
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            input_texts_cleaned, target_texts_cleaned, test_size=0.1, random_state=42  # fixed seed for a reproducible split
        )

        logger.info("Preparing datasets for training.")
        train_dataset = Dataset.from_dict(prepare_data(train_texts, train_labels, tokenizer))
        val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))

        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="steps",
            learning_rate=5e-5,
            per_device_train_batch_size=4,
            num_train_epochs=3,
            save_steps=500,
            logging_dir="./logs",
            logging_steps=10,
        )
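        # Note: newer transformers releases rename evaluation_strategy to
        # eval_strategy; use whichever spelling the installed version expects.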

        logger.info("Starting model training.")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )
        trainer.train()

        logger.info("Saving fine-tuned model.")
        model.save_pretrained("./fine_tuned_model")
        tokenizer.save_pretrained("./fine_tuned_model")

    except Exception as e:
        # logger.exception records the full traceback, which error() with str(e) drops.
        logger.exception(f"An error occurred during fine-tuning: {e}")


if __name__ == "__main__":
    fine_tune_model()
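

# A minimal inference sketch (assumes the fine-tuned model has been saved above;
# inputs are cleaned the same way as during training):
#
#   tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_model")
#   model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_model")
#   ids = tokenizer(clean_text("some new input"), return_tensors="pt").input_ids
#   out = model.generate(ids, max_new_tokens=64)
#   print(tokenizer.decode(out[0], skip_special_tokens=True))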