import os
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
# Check if CUDA is available and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# Load the model and tokenizer
model_path = "./emotion_model"
hub_path = "dasdristanta13/twitter-emotion-model"
if os.path.isdir(model_path):
    emotion_model = AutoModelForSequenceClassification.from_pretrained(model_path)
    emotion_tokenizer = AutoTokenizer.from_pretrained(model_path)
else:
    emotion_model = AutoModelForSequenceClassification.from_pretrained(hub_path)
    emotion_tokenizer = AutoTokenizer.from_pretrained(hub_path)
# Move the model to the appropriate device
emotion_model = emotion_model.to(device)
# Load a pre-trained sentence transformer model for semantic similarity
semantic_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
semantic_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
semantic_model = semantic_model.to(device)
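# Note: AutoModel loads the bare MiniLM encoder without a pooling head, so
# sentence embeddings are built manually from the token hidden states in
# get_embedding below.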
target_mapping = {
    'Google': 'Google',
    'Apple': 'Apple',
    'iPad': 'Apple',
    'iPhone': 'Apple',
    'Other Google product or service': 'Google',
    'Other Apple product or service': 'Apple',
    'Android': 'Google',
    'Android App': 'Google',
    'iPad or iPhone App': 'Apple',
}
def get_embedding(text):
    inputs = semantic_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = semantic_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
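# The mean above averages over every token position, including any padding
# introduced when batching. A sketch of an alternative, assuming the goal is
# the standard sentence-transformers mean pooling (get_embedding_masked is a
# hypothetical helper, not part of the original script):
def get_embedding_masked(text):
    inputs = semantic_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = semantic_model(**inputs)
    mask = inputs['attention_mask'].unsqueeze(-1).float()   # (batch, seq, 1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)  # sum over real tokens only
    counts = mask.sum(dim=1).clamp(min=1e-9)                # number of real tokens
    return (summed / counts).cpu().numpy()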
def predict_target(text):
    text_embedding = get_embedding(text)
    target_embeddings = {target: get_embedding(target) for target in target_mapping.keys()}
    similarities = {target: cosine_similarity(text_embedding, emb)[0][0] for target, emb in target_embeddings.items()}
    predicted_target = max(similarities, key=similarities.get)
    return predicted_target
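# predict_target re-embeds every candidate target on each call, and the
# target_mapping dict above is never actually consulted. A sketch of a cached
# variant that also collapses the matched product into its parent company,
# assuming that is what the mapping is for (get_target_company and the cache
# are hypothetical names, not part of the original script):
_target_embedding_cache = {target: get_embedding(target) for target in target_mapping}
def get_target_company(text):
    text_embedding = get_embedding(text)
    similarities = {target: cosine_similarity(text_embedding, emb)[0][0]
                    for target, emb in _target_embedding_cache.items()}
    best_target = max(similarities, key=similarities.get)
    return target_mapping[best_target]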
def predict_emotion(text, target):
    combined_input = f"{text} [SEP] {target}"
    inputs = emotion_tokenizer(combined_input, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = emotion_model(**inputs)
    probabilities = outputs.logits.softmax(dim=-1).squeeze().cpu().numpy()
    emotion_labels = ['Negative emotion', 'Positive emotion', 'No emotion']
    predicted_emotion = emotion_labels[np.argmax(probabilities)]
    return predicted_emotion, {label: float(prob) for label, prob in zip(emotion_labels, probabilities)}
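# Quick sanity check (illustrative only; the tweet, target, and printed
# values are invented, not verified model outputs):
#   emotion, probs = predict_emotion("The new iPad keynote was fantastic!", "iPad")
#   print(emotion)  # e.g. 'Positive emotion'
#   print(probs)    # e.g. {'Negative emotion': 0.03, 'Positive emotion': 0.91, ...}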
def process_test_data(test_df):
    results = []
    for _, row in test_df.iterrows():
        text = row['Tweet']
        predicted_target = predict_target(text)
        predicted_emotion, emotion_probs = predict_emotion(text, predicted_target)
        results.append({
            'Tweet': text,
            'Predicted Target': predicted_target,
            'Predicted Emotion': predicted_emotion,
            'Emotion Probabilities': emotion_probs
        })
    return pd.DataFrame(results)
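# Minimal smoke test, assuming only a 'Tweet' column is required
# (the sample tweets are invented for illustration):
#   sample_df = pd.DataFrame({'Tweet': ["Loving my new iPhone!",
#                                       "Android keeps crashing on me."]})
#   print(process_test_data(sample_df))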
if __name__ == "__main__":
    test_df = pd.read_excel('NLP Engineer Assignment Dataset (1) (1) (1) (1).xlsx', sheet_name='Test')
    results_df = process_test_data(test_df)
    results_df.to_csv('test_results.csv', index=False)
    print("Results saved to test_results.csv")