File size: 3,742 Bytes
8ba260b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97

import os
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Run on the GPU when PyTorch reports CUDA support, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


# Load the emotion classifier and its tokenizer. A local checkpoint directory
# is preferred; when it does not exist, the repository id in hub_path is used.
model_path = "./emotion_model"
hub_path = "dasdristanta13/twitter-emotion-model"
_emotion_source = model_path if os.path.isdir(model_path) else hub_path
emotion_model = AutoModelForSequenceClassification.from_pretrained(_emotion_source)
emotion_tokenizer = AutoTokenizer.from_pretrained(_emotion_source)

# Move the emotion classifier to the module-level `device` chosen right after
# the imports; the device is selected once there and is not recomputed here.
emotion_model = emotion_model.to(device)

# Sentence-embedding model used to measure semantic similarity between a tweet
# and the candidate target names. Model and tokenizer share one checkpoint id.
_SEMANTIC_CHECKPOINT = 'sentence-transformers/all-MiniLM-L6-v2'
semantic_tokenizer = AutoTokenizer.from_pretrained(_SEMANTIC_CHECKPOINT)
semantic_model = AutoModel.from_pretrained(_SEMANTIC_CHECKPOINT).to(device)

# Maps each fine-grained target label (product, service or app category) to
# its parent brand, 'Google' or 'Apple'.
# NOTE(review): in the code visible here only the KEYS are read (as candidate
# labels in predict_target); the brand values are never looked up — confirm
# whether predictions were meant to be collapsed to the brand.
# Key order matters: it is the iteration order used when picking the most
# similar target, so it decides which label wins an exact tie.
target_mapping = {
    'Google': 'Google',
    'Apple': 'Apple',
    'iPad': 'Apple',
    'iPhone': 'Apple',
    'Other Google product or service': 'Google',
    'Other Apple product or service': 'Apple',
    'Android': 'Google',
    'Android App': 'Google',
    'iPad or iPhone App': 'Apple',
}

def get_embedding(text):
    """Return mean-pooled sentence embeddings for ``text``.

    Args:
        text: A string, or a list of strings, passed straight to
            ``semantic_tokenizer``. Inputs are truncated to 128 tokens.

    Returns:
        A NumPy array with one row per input text, holding the average of the
        model's last hidden states over the real (non-padding) tokens.
    """
    inputs = semantic_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = semantic_model(**inputs)

    hidden = outputs.last_hidden_state
    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
        # No mask available: every position counts, same as a plain mean.
        return hidden.mean(dim=1).cpu().numpy()

    # Weight each token by its attention mask so padding added for batched,
    # unequal-length inputs does not dilute the average. For a single string
    # the mask is all ones and this equals the unmasked mean.
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)  # guard against divide-by-zero
    return (token_sum / token_count).cpu().numpy()

# Embeddings of the candidate target names, keyed by name. They depend only on
# the name, so each one is computed on first use and then reused across calls.
_TARGET_EMBEDDING_CACHE = {}


def _target_embedding(target):
    """Return the embedding of a target name, computing it at most once."""
    if target not in _TARGET_EMBEDDING_CACHE:
        _TARGET_EMBEDDING_CACHE[target] = get_embedding(target)
    return _TARGET_EMBEDDING_CACHE[target]


def predict_target(text):
    """Pick the target label whose name is semantically closest to ``text``.

    The text embedding is compared by cosine similarity against the embedding
    of every key in ``target_mapping``.

    Args:
        text: The tweet text to classify.

    Returns:
        The key of ``target_mapping`` with the highest similarity (for example
        ``'iPad'``); the key itself is returned, not the mapped brand value.
        On an exact tie the earliest key in ``target_mapping`` wins.

    Raises:
        ValueError: If ``target_mapping`` is empty.
    """
    text_embedding = get_embedding(text)

    # Target embeddings are cached, so only the tweet needs a forward pass
    # instead of one pass per target on every call.
    similarities = {
        target: cosine_similarity(text_embedding, _target_embedding(target))[0][0]
        for target in target_mapping
    }
    return max(similarities, key=similarities.get)

def predict_emotion(text, target):
    """Classify the emotion expressed in ``text`` towards ``target``.

    The tweet and the target are joined as ``"<text> [SEP] <target>"``, run
    through the emotion classifier, and the logits are turned into
    probabilities with a softmax.

    Args:
        text: The tweet text.
        target: The target label the emotion is directed at.

    Returns:
        A ``(label, probabilities)`` tuple: the most probable emotion label
        and a dict mapping each emotion label to its probability as a float.
    """
    # NOTE(review): assumed to match the classifier's output index order —
    # confirm against the model's label configuration.
    emotion_labels = ['Negative emotion', 'Positive emotion','No emotion']

    encoded = emotion_tokenizer(f"{text} [SEP] {target}", return_tensors="pt", truncation=True, padding=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = emotion_model(**encoded).logits

    # squeeze() drops the batch dimension of the single encoded input.
    probabilities = torch.softmax(logits, dim=-1).squeeze().cpu().numpy()
    best_index = np.argmax(probabilities)
    scores = {label: float(score) for label, score in zip(emotion_labels, probabilities)}

    return emotion_labels[best_index], scores

def process_test_data(test_df):
    """Predict a target and an emotion for every tweet in ``test_df``.

    Args:
        test_df: DataFrame whose ``'Tweet'`` column holds the tweet texts.

    Returns:
        A DataFrame with one row per input row and the columns ``'Tweet'``,
        ``'Predicted Target'``, ``'Predicted Emotion'`` and
        ``'Emotion Probabilities'``. An empty input gives an empty DataFrame.
    """
    def _analyse(tweet):
        # The target is predicted first because the emotion model takes it
        # as part of its input.
        target = predict_target(tweet)
        emotion, probabilities = predict_emotion(tweet, target)
        return {
            'Tweet': tweet,
            'Predicted Target': target,
            'Predicted Emotion': emotion,
            'Emotion Probabilities': probabilities,
        }

    records = [_analyse(entry['Tweet']) for _, entry in test_df.iterrows()]
    return pd.DataFrame(records)

if __name__ == "__main__":
    # Script entry point: read the 'Test' sheet of the assignment workbook,
    # run target and emotion prediction on it, and write the results to CSV.
    input_workbook = 'NLP Engineer Assignment Dataset (1) (1) (1) (1).xlsx'
    output_csv = 'test_results.csv'

    predictions = process_test_data(pd.read_excel(input_workbook, sheet_name='Test'))
    predictions.to_csv(output_csv, index=False)
    print("Results saved to test_results.csv")