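"""Inference pipeline for tweet-level target and emotion prediction.

Loads a fine-tuned emotion classifier (local checkpoint with a Hugging Face
Hub fallback) plus a MiniLM sentence encoder, assigns each tweet the
candidate target with the highest embedding similarity, classifies the
emotion expressed toward that target, and writes the results to CSV.
"""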
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load the fine-tuned emotion classifier from a local checkpoint if one
# exists; otherwise fall back to the copy on the Hugging Face Hub.
model_path = "./emotion_model"
hub_path = "dasdristanta13/twitter-emotion-model"

if os.path.isdir(model_path):
    emotion_model = AutoModelForSequenceClassification.from_pretrained(model_path)
    emotion_tokenizer = AutoTokenizer.from_pretrained(model_path)
else:
    emotion_model = AutoModelForSequenceClassification.from_pretrained(hub_path)
    emotion_tokenizer = AutoTokenizer.from_pretrained(hub_path)

emotion_model = emotion_model.to(device)

# Sentence encoder used to match each tweet to a candidate target.
semantic_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
semantic_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
semantic_model = semantic_model.to(device)

# Candidate targets (keys) and their parent brands (values); the keys
# double as the label set for target prediction.
target_mapping = {
    'Google': 'Google',
    'Apple': 'Apple',
    'iPad': 'Apple',
    'iPhone': 'Apple',
    'Other Google product or service': 'Google',
    'Other Apple product or service': 'Apple',
    'Android': 'Google',
    'Android App': 'Google',
    'iPad or iPhone App': 'Apple',
}
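# If a brand-level label is wanted, it can be looked up from the mapping,
# e.g. target_mapping['iPad'] yields 'Apple'.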


def get_embedding(text):
    """Return a mean-pooled sentence embedding for `text` as a (1, dim) array."""
    inputs = semantic_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = semantic_model(**inputs)
    # Mean-pool the token embeddings into a single sentence vector.
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
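
# Illustrative usage (a sketch; assumes the encoder above is loaded):
#   emb = get_embedding("I love my new iPad")
#   emb.shape  # (1, 384): all-MiniLM-L6-v2 has 384-dim hidden states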


def predict_target(text):
    """Pick the target whose label embedding is most similar to the tweet's."""
    text_embedding = get_embedding(text)
    # Target embeddings are recomputed on every call; since target_mapping is
    # fixed, they could be precomputed once and reused.
    target_embeddings = {target: get_embedding(target) for target in target_mapping}

    similarities = {target: cosine_similarity(text_embedding, emb)[0][0]
                    for target, emb in target_embeddings.items()}
    predicted_target = max(similarities, key=similarities.get)

    return predicted_target
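
# Illustrative usage (a sketch; actual output depends on the encoder):
#   predict_target("Loving the new iPhone camera!")  # likely 'iPhone'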


def predict_emotion(text, target):
    """Classify the emotion expressed in `text` toward `target`."""
    # Feed tweet and target to the classifier as a single [SEP]-joined sequence.
    combined_input = f"{text} [SEP] {target}"
    inputs = emotion_tokenizer(combined_input, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = emotion_model(**inputs)

    # Convert logits to a probability distribution over the three classes.
    probabilities = outputs.logits.softmax(dim=-1).squeeze().cpu().numpy()

    emotion_labels = ['Negative emotion', 'Positive emotion', 'No emotion']
    predicted_emotion = emotion_labels[np.argmax(probabilities)]

    return predicted_emotion, {label: float(prob) for label, prob in zip(emotion_labels, probabilities)}
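
# Illustrative usage (a sketch):
#   emotion, probs = predict_emotion("The battery drains too fast", "iPhone")
#   # `emotion` is one of the three labels; `probs` maps each label to its score.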


def process_test_data(test_df):
    """Run target and emotion prediction over every row of the test set."""
    results = []
    for _, row in test_df.iterrows():
        text = row['Tweet']
        predicted_target = predict_target(text)
        predicted_emotion, emotion_probs = predict_emotion(text, predicted_target)

        results.append({
            'Tweet': text,
            'Predicted Target': predicted_target,
            'Predicted Emotion': predicted_emotion,
            'Emotion Probabilities': emotion_probs,
        })

    return pd.DataFrame(results)


if __name__ == "__main__":
    test_df = pd.read_excel('NLP Engineer Assignment Dataset (1) (1) (1) (1).xlsx', sheet_name='Test')
    results_df = process_test_data(test_df)
    results_df.to_csv('test_results.csv', index=False)
    print("Results saved to test_results.csv")