import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.metrics import classification_report
import numpy as np
import re
import wandb
from imblearn.over_sampling import RandomOverSampler
import random
from nltk.corpus import wordnet
import nltk
from warnings import filterwarnings

filterwarnings('ignore')
nltk.download('wordnet', quiet=True)
wandb.init(mode="disabled")


def clean_text(text):
    """Strip mentions, hashtags, link placeholders, URLs and punctuation from a tweet."""
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'{links}', '', text)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'[^A-Za-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def preprocess_data(df):
    """Rename columns, drop unusable rows, and map product targets to a coarser set."""
    df.columns = ['text', 'target', 'emotion']
    df = df[df['emotion'] != 'I can\'t tell']
    df = df[~df['text'].isna()].reset_index(drop=True)
    target_mapping = {
        'Google': 'Google',
        'Apple': 'Apple',
        'iPad': 'iPad',
        'iPhone': 'iPhone',
        'Other Google product or service': 'Google',
        'Other Apple product or service': 'Apple',
        'Android': 'Android',
        'Android App': 'Android App',
        'iPad or iPhone App': 'iPad or iPhone App',
    }
    df['new_text'] = df['text'].apply(clean_text)
    # Unknown or missing targets fall back to 'No Product'.
    df['new_target'] = df['target'].apply(lambda x: target_mapping.get(x, 'No Product'))
    df['target'] = df['target'].fillna('No Product')
    return df


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        'accuracy': (predictions == labels).mean(),
    }


def random_word_drop(text, max_words=3):
    """Drop up to `max_words` randomly chosen words from the text."""
    words = text.split()
    if len(words) <= max_words:
        return text
    num_to_drop = random.randint(1, min(max_words, len(words) - 1))
    drop_indices = random.sample(range(len(words)), num_to_drop)
    return ' '.join([word for i, word in enumerate(words) if i not in drop_indices])


def synonym_replacement(text, n=1):
    """Replace up to `n` words with a randomly chosen WordNet synonym."""
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word.isalnum()]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = []
        for syn in wordnet.synsets(random_word):
            for l in syn.lemmas():
                # Lemma names use underscores for multi-word synonyms; restore spaces.
                synonyms.append(l.name().replace('_', ' '))
        if len(synonyms) >= 1:
            synonym = random.choice(list(set(synonyms)))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)


def print_class_distribution(df, stage):
    class_dist = df['sentiment_label'].value_counts().sort_index()
    total = len(df)
    print(f"\nClass distribution - {stage}:")
    for label, count in class_dist.items():
        percentage = (count / total) * 100
        print(f"Class {label}: {count} ({percentage:.2f}%)")
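# Illustrative only (not part of the training pipeline): what the helpers above do
# to a raw tweet. Note that clean_text removes the whole hashtag token, so product
# hashtags such as "#iPad" disappear along with the "#":
#   clean_text("@mention Loving the new #iPad at #SXSW! http://t.co/abc")
#   # -> "Loving the new at"
# random_word_drop and synonym_replacement then produce lightly perturbed copies of
# the cleaned text; their output varies from call to call because both sample randomly.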
def augment_data(df):
    """Create augmented copies of the training tweets.

    The majority class only gets an extra word-drop copy 40% of the time; every
    other class gets a word-drop copy and a synonym-replacement copy; the minority
    class additionally gets a combined drop-plus-synonym copy.
    """
    class_counts = df['sentiment_label'].value_counts()
    max_class = class_counts.idxmax()
    min_class = class_counts.idxmin()
    augmented_data = []
    for _, row in df.iterrows():
        # Original example
        augmented_data.append({
            'new_text': row['new_text'],
            'new_target': row['new_target'],
            'sentiment_label': row['sentiment_label']
        })
        # Augment less for the majority class
        if row['sentiment_label'] == max_class:
            if random.random() < 0.4:  # 40% chance to augment
                new_text = random_word_drop(row['new_text'])
                augmented_data.append({
                    'new_text': new_text,
                    'new_target': row['new_target'],
                    'sentiment_label': row['sentiment_label']
                })
        else:
            # Random word drop
            new_text = random_word_drop(row['new_text'])
            augmented_data.append({
                'new_text': new_text,
                'new_target': row['new_target'],
                'sentiment_label': row['sentiment_label']
            })
            # Synonym replacement
            new_text = synonym_replacement(row['new_text'])
            augmented_data.append({
                'new_text': new_text,
                'new_target': row['new_target'],
                'sentiment_label': row['sentiment_label']
            })
        # Extra augmentation for the minority class
        if row['sentiment_label'] == min_class:
            new_text = random_word_drop(synonym_replacement(row['new_text']))
            augmented_data.append({
                'new_text': new_text,
                'new_target': row['new_target'],
                'sentiment_label': row['sentiment_label']
            })
    augmented_df = pd.DataFrame(augmented_data)
    augmented_df['combined_input'] = augmented_df['new_text'] + " [SEP] " + augmented_df['new_target']
    return augmented_df


def balance_data(df):
    """Random-oversample the minority classes until all classes have equal counts."""
    X = df[['combined_input']]
    y = df['sentiment_label']
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X, y)
    balanced_df = pd.DataFrame({
        'combined_input': X_resampled['combined_input'],
        'sentiment_label': y_resampled
    })
    return balanced_df


def train_model_v2(df):
    emo_dict = {'Negative emotion': 0, 'Positive emotion': 1, 'No emotion toward brand or product': 2}
    df['sentiment_label'] = df['emotion'].apply(lambda x: emo_dict[x])

    train_data, val_data = train_test_split(df, test_size=0.2, stratify=df['sentiment_label'], random_state=42)
    print_class_distribution(train_data, "Before augmentation and balancing")

    # Augment and balance only the training data
    train_data = augment_data(train_data)
    print_class_distribution(train_data, "After augmentation")
    train_data = balance_data(train_data)
    print_class_distribution(train_data, "After balancing")

    # The validation split is left untouched apart from building the model input.
    val_data = val_data.copy()
    val_data['combined_input'] = val_data['new_text'] + " [SEP] " + val_data['new_target']

    train_data = train_data[['combined_input', 'sentiment_label']]
    val_data = val_data[['combined_input', 'sentiment_label']]

    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

    def preprocess_function(examples):
        tokenized_inputs = tokenizer(examples['combined_input'], padding='max_length', truncation=True, max_length=128)
        tokenized_inputs['labels'] = examples['sentiment_label']
        return tokenized_inputs

    train_dataset = Dataset.from_pandas(train_data)
    val_dataset = Dataset.from_pandas(val_data.reset_index(drop=True))
    train_dataset = train_dataset.map(preprocess_function, batched=True)
    val_dataset = val_dataset.map(preprocess_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)

    # Freeze everything except the top four encoder layers, the pooler and the classifier head.
    for name, param in model.named_parameters():
        if not any(layer in name for layer in ['encoder.layer.11', 'encoder.layer.10', 'encoder.layer.9', 'encoder.layer.8', 'pooler', 'classifier']):
            param.requires_grad = False

    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=50,
        learning_rate=1e-5,
        per_device_train_batch_size=256,
        per_device_eval_batch_size=256,
        num_train_epochs=8,
        weight_decay=0.01,
        logging_strategy="steps",
        load_best_model_at_end=True,
        report_to="none",
        metric_for_best_model="eval_loss",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()

    # Save the model and tokenizer
    output_dir = "./emotion_model"
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved to {output_dir}")
    # Generate and print a classification report on the validation split
    predictions = trainer.predict(val_dataset)
    preds = np.argmax(predictions.predictions, axis=-1)
    labels = predictions.label_ids
    print("\nClassification Report:")
    print(classification_report(labels, preds, target_names=list(emo_dict.keys())))


if __name__ == "__main__":
    df = pd.read_excel('NLP Engineer Assignment Dataset (1) (1) (1) (1).xlsx', sheet_name='Train')
    processed_df = preprocess_data(df)
    train_model_v2(processed_df)
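
# A minimal inference sketch (an addition, not part of the original pipeline). It
# reloads the fine-tuned checkpoint saved to ./emotion_model above and mirrors the
# training input format "<cleaned text> [SEP] <target>"; label ids follow emo_dict
# (0 = Negative emotion, 1 = Positive emotion, 2 = No emotion toward brand or product).
def predict_sentiment(text, target, model_dir="./emotion_model"):
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    combined = clean_text(text) + " [SEP] " + target
    inputs = tokenizer(combined, return_tensors="pt", truncation=True, padding='max_length', max_length=128)
    with torch.no_grad():
        logits = model(**inputs).logits
    label_names = ['Negative emotion', 'Positive emotion', 'No emotion toward brand or product']
    return label_names[int(logits.argmax(dim=-1).item())]

# Example usage:
#   predict_sentiment("Great battery life on the new iPad at #sxsw", "iPad")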