Spaces:
Running
Running
import gradio as gr | |
import torch | |
import torch.nn.functional as F | |
import re | |
import nltk | |
nltk.download('stopwords') | |
nltk.download('wordnet') | |
import os | |
from nltk.stem import WordNetLemmatizer | |
from nltk.corpus import stopwords | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
import urllib.request | |
# Model initialisation: Hugging Face checkpoints that take part in the vote.
model_names = [
    "distilbert-base-uncased",
    "prajjwal1/bert-tiny",
    "roberta-base",
    "google/mobilebert-uncased",
    "albert-base-v2",
    "xlm-roberta-base",
]
# Both registries are keyed by the entries of model_names and start out empty.
models = dict()
tokenizers = dict()

# === Weight file name -> download URL ===
# File names follow "best_model_<model name with '/' replaced by '_'>.pth".
model_urls = {
    "best_model_albert-base-v2.pth": "https://www.dropbox.com/scl/fi/adulme5xarg6hgxbs26fm/best_model_albert-base-v2.pth?rlkey=y17x3sw1frk83yfzt8zc00458&st=43uha18d&dl=1",
    "best_model_distilbert-base-uncased.pth": "https://www.dropbox.com/scl/fi/8y3oyfbzmbmn427e1ei3d/best_model_distilbert-base-uncased.pth?rlkey=u9rd40tdd3p781r4xtv8wi5t6&st=nfzq7x8j&dl=1",
    "best_model_google_mobilebert-uncased.pth": "https://www.dropbox.com/scl/fi/7zdarid2no1fw0b8hk0tf/best_model_google_mobilebert-uncased.pth?rlkey=w13j1jampxlt8himivj090nwv&st=0zq6yofp&dl=1",
    "best_model_prajjwal1_bert-tiny.pth": "https://www.dropbox.com/scl/fi/vscwewy4uo58o7xswokxt/best_model_prajjwal1_bert-tiny.pth?rlkey=uav8aas7fxb5nl2w5iacg1qyb&st=12mzggan&dl=1",
    "best_model_roberta-base.pth": "https://www.dropbox.com/scl/fi/zqmlzt0q6knjv096yswsr/best_model_roberta-base.pth?rlkey=hi8ddi23dnz45xt3jomxq0pek&st=2axjymyt&dl=1",
    "best_model_xlm-roberta-base.pth": "https://www.dropbox.com/scl/fi/2gao9iqesou9kb633vvan/best_model_xlm-roberta-base.pth?rlkey=acyvwt8qtle8wzle5idfo8241&st=8livizox&dl=1",
}
# === Download every weight file that is not already present locally ===
for filename, url in model_urls.items():
    if os.path.exists(filename):
        continue
    print(f"Lejupielādē: {filename}")
    # Fetch into a temporary sibling file first. A transfer that fails midway
    # can leave a truncated file behind; under the final name such a file
    # would satisfy the existence check on the next start and never be
    # downloaded again.
    partial_path = f"{filename}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Publish the file under its real name only once it is complete.
        os.replace(partial_path, filename)
        print(f" → Saglabāts: {filename}")
    except Exception as e:
        # Best effort: report the failure and continue with the other files.
        print(f" [!] Kļūda lejupielādējot {filename}: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
# Load the tokenizer and the fine-tuned weights for every model in the ensemble.
for model_name in model_names:
    # Tokenizer. `model_max_length` is the limit that `truncation=True` falls
    # back to when a call gives no explicit `max_length`; a plain `max_length`
    # keyword here does not set that limit, so checkpoints that declare no
    # maximum length of their own would not be truncated at all.
    tokenizers[model_name] = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    # Base architecture with a 3-class classification head.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
    # Weight files are named after the model with "/" replaced by "_".
    model_file_name = model_name.replace('/', '_')
    # NOTE(review): depending on the installed PyTorch version, torch.load may
    # unpickle arbitrary objects, and these files come from the download step.
    # Consider passing weights_only=True once the minimum version allows it.
    model.load_state_dict(torch.load(f"best_model_{model_file_name}.pth", map_location=torch.device('cpu')))
    # Keep the model on the CPU and switch it to inference mode.
    models[model_name] = model.to('cpu')
    models[model_name].eval()
# Label mapping: class index produced by the models -> display name.
labels = dict(enumerate(("Safe", "Spam", "Phishing")))
# NLTK resources shared by every call to the text preprocessing.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess(text):
    """Normalise raw e-mail text before it is tokenised.

    Steps, in order: lower-case the text, drop every run that starts with
    ``http`` up to the next whitespace, replace every character other than
    ``a``-``z`` and the apostrophe with a space, collapse whitespace, remove
    English stop words and lemmatise the remaining words.

    Args:
        text: The e-mail text as a string.

    Returns:
        The cleaned words joined by single spaces.
    """
    cleaned = re.sub(r'http\S+', '', text.lower())  # strip URLs
    cleaned = re.sub(r"[^a-z']", ' ', cleaned)  # keep letters and apostrophes only
    words = re.sub(r'\s+', ' ', cleaned).strip().split()  # collapse whitespace
    kept = []
    for word in words:
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)
# Classification function (single model)
def classify_email_single_model(text, model_name):
    """Classify one e-mail with a single model of the ensemble.

    Args:
        text: Raw e-mail text; it is passed through ``preprocess`` first.
        model_name: Key into the module-level ``tokenizers`` and ``models``
            registries.

    Returns:
        A dict with two entries: ``"prediction"`` is the label name of the
        highest-scoring class, and ``"probabilities"`` is a NumPy array of
        softmax scores in percent, with the class scores along the last axis.
    """
    text = preprocess(text)
    # Cap the sequence length explicitly. With only `truncation=True` the cut
    # depends on the tokenizer's own configured maximum, and a tokenizer that
    # has none leaves long e-mails untruncated, which can exceed what the
    # model accepts.
    inputs = tokenizers[model_name](
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = models[model_name](**inputs).logits
    prediction = torch.argmax(logits, dim=1).item()
    probs_percent = F.softmax(logits, dim=1).cpu().numpy() * 100
    return {"prediction": labels[prediction], "probabilities": probs_percent}
# Classification function (all models together, probabilities for each model)
def classify_email_detailed(text):
    """Classify an e-mail with every model and report votes and probabilities.

    Args:
        text: Raw e-mail text.

    Returns:
        A multi-line string. The first line is the vote tally in the form
        ``"Safe: N votes, Spam: N votes, Phishing: N votes"``. After a blank
        line comes one line per model listing each class probability in
        percent with two decimals.
    """
    votes = {"Safe": 0, "Spam": 0, "Phishing": 0}
    probabilities = {}
    for model_name in model_names:
        result = classify_email_single_model(text, model_name)
        votes[result['prediction']] += 1
        probabilities[model_name] = result['probabilities']

    # Joining the parts puts separators only between entries, so the layout
    # no longer depends on a hard-coded count of labels.
    tally = ", ".join(
        f"{label}: {vote_count} {'vote' if vote_count == 1 else 'votes'}"
        for label, vote_count in votes.items()
    )

    # The empty entry produces the blank line between tally and details.
    lines = [tally, ""]
    for model_name in model_names:
        # Index 0 selects the scores of the single e-mail that was classified.
        per_class = "".join(
            f"{labels[j]}: {prob:.2f}% "
            for j, prob in enumerate(probabilities[model_name][0])
        )
        lines.append(f"{model_name}: {per_class}")
    return "\n".join(lines) + "\n"
# Classification function (all models together, just the votes)
def classify_email_simple(text):
    """Classify an e-mail with every model and report only the vote tally.

    Args:
        text: Raw e-mail text.

    Returns:
        A string of the form
        ``"Safe: N votes, Spam: N votes, Phishing: N votes"`` followed by two
        newline characters.
    """
    votes = {"Safe": 0, "Spam": 0, "Phishing": 0}
    for model_name in model_names:
        result = classify_email_single_model(text, model_name)
        votes[result['prediction']] += 1

    # Joining the parts puts separators only between entries, so the layout
    # no longer depends on a hard-coded count of labels.
    tally = ", ".join(
        f"{label}: {vote_count} {'vote' if vote_count == 1 else 'votes'}"
        for label, vote_count in votes.items()
    )
    return f"{tally}\n\n"
def classify_email(text, mode):
    """Dispatch a classification request to the report style chosen in the UI.

    Args:
        text: Raw e-mail text.
        mode: The selected radio option. ``"Tikai balsis"`` selects the
            votes-only report; any other value selects the detailed report.

    Returns:
        The report string produced by the selected classification function.
    """
    votes_only = mode == "Tikai balsis"
    handler = classify_email_simple if votes_only else classify_email_detailed
    return handler(text)
# Gradio UI: a text box for the e-mail and a radio button for the report style.
_email_box = gr.Textbox(lines=10, placeholder="Ievietojiet savu e-pastu šeit...")
_mode_picker = gr.Radio(
    choices=["Tikai balsis", "Balsis un varbūtības"],
    label='Klasifikācijas veids',
)
demo = gr.Interface(
    fn=classify_email,
    inputs=[_email_box, _mode_picker],
    outputs="text",
    title="E-pastu klasifikators (vairāku modeļu balsošana)",
    description="Autori: Kristaps Tretjuks un Aleksejs Gorlovičs",
)
# Start the web app and request a public share link.
demo.launch(share=True)