# Spaces: Sleeping
import pandas as pd | |
import numpy as np | |
import torch | |
import torch.nn as nn | |
from torch.utils.data import Dataset, DataLoader | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.preprocessing import LabelEncoder | |
from sklearn.metrics import classification_report | |
import joblib | |
import os | |
# === Load Data ===
def _load_split(csv_path):
    """Read one CSV split, keeping only rows usable for supervised training.

    Empty CSV cells are parsed by pandas as NaN. A NaN in ``Resume_str`` makes
    ``TfidfVectorizer`` raise, and a row without a ``Category`` has no target
    to learn from, so such rows are dropped before any fitting happens.

    Args:
        csv_path: Path to a CSV file with ``Resume_str`` and ``Category`` columns.

    Returns:
        A DataFrame with a fresh 0..n-1 index and no missing text or label.
    """
    frame = pd.read_csv(csv_path)
    cleaned = frame.dropna(subset=["Resume_str", "Category"]).reset_index(drop=True)
    dropped = len(frame) - len(cleaned)
    if dropped:
        # Surface the loss instead of shrinking the dataset silently.
        print(f"Dropped {dropped} row(s) with missing text/label from {csv_path}")
    return cleaned


train_df = _load_split("train.csv")
val_df = _load_split("val.csv")

# === Preprocess ===
# The TF-IDF vocabulary is fitted on the training split only and then reused
# for validation, so no validation text influences the features.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = vectorizer.fit_transform(train_df['Resume_str']).toarray()
X_val = vectorizer.transform(val_df['Resume_str']).toarray()

# Category names -> integer class ids; fitted on the training labels only.
le = LabelEncoder()
y_train = le.fit_transform(train_df['Category'])
y_val = le.transform(val_df['Category'])
# === PyTorch Dataset & Dataloader ===
class ResumeDataset(Dataset):
    """Map-style dataset pairing feature rows with integer class labels.

    Both inputs are converted to tensors once, at construction time:
    features as ``float32`` and labels as ``long``.
    """

    def __init__(self, X, y):
        features = torch.tensor(X, dtype=torch.float32)
        targets = torch.tensor(y, dtype=torch.long)
        self.X, self.y = features, targets

    def __len__(self):
        # The label tensor defines how many samples the dataset holds.
        return len(self.y)

    def __getitem__(self, idx):
        sample, target = self.X[idx], self.y[idx]
        return sample, target
_BATCH_SIZE = 32
_train_dataset = ResumeDataset(X_train, y_train)
train_loader = DataLoader(
    _train_dataset,
    batch_size=_BATCH_SIZE,
    shuffle=True,
    # nn.BatchNorm1d cannot compute batch statistics from a single sample in
    # training mode and raises. Drop the trailing batch only when it would
    # contain exactly one sample; every other dataset size is unaffected.
    drop_last=(len(_train_dataset) % _BATCH_SIZE == 1),
)
# Validation runs in eval mode (running statistics), so a size-1 batch is fine
# there and every sample is kept, in a fixed order.
val_loader = DataLoader(ResumeDataset(X_val, y_val), batch_size=_BATCH_SIZE, shuffle=False)
# === Model ===
class ResumeClassifier(nn.Module):
    """Feed-forward classifier: two hidden blocks followed by a linear head.

    Each hidden block is Linear -> ReLU -> BatchNorm1d, with widths 256 and
    128. The head emits one raw score per class (no softmax is applied here).
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        stack = []
        in_features = input_dim
        for width in (256, 128):
            stack += [
                nn.Linear(in_features, width),
                nn.ReLU(),
                nn.BatchNorm1d(width),
            ]
            in_features = width
        stack.append(nn.Linear(in_features, num_classes))
        # Layers are registered positionally, so parameter names stay
        # "model.0.*", "model.2.*", ... "model.6.*".
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        logits = self.model(x)
        return logits
# Size the input layer from the fitted feature matrix rather than assuming
# 5000 columns: TfidfVectorizer's max_features is only an upper bound, so a
# smaller corpus vocabulary yields fewer features and a hard-coded width would
# fail on the first forward pass.
model = ResumeClassifier(input_dim=X_train.shape[1], num_classes=len(le.classes_))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# === Train Function ===
def train_model(model, loader):
    """Run one optimisation pass over every batch in ``loader``.

    Relies on the module-level ``optimizer`` and ``criterion``; the model is
    switched to training mode first. Returns nothing.
    """
    model.train()
    for features, targets in loader:
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()
# === Eval Function ===
def evaluate_model(model, loader):
    """Collect true labels and predicted class ids over all batches.

    The model is put in eval mode and run without gradient tracking. The
    predicted class of each row is the index of its largest score.

    Returns:
        ``(labels, predictions)`` as two flat lists in loader order.
    """
    model.eval()
    labels_seen, preds_seen = [], []
    with torch.no_grad():
        for features, targets in loader:
            scores = model(features)
            batch_preds = scores.argmax(dim=1)
            preds_seen.extend(batch_preds.cpu().numpy())
            labels_seen.extend(targets.cpu().numpy())
    return labels_seen, preds_seen
# === Training ===
for epoch in range(5):
    train_model(model, train_loader)
    # Validate after every epoch; y_true / y_pred from the final epoch remain
    # bound after the loop ends.
    y_true, y_pred = evaluate_model(model, val_loader)
    hits = np.asarray(y_true) == np.asarray(y_pred)
    acc = hits.mean()
    print(f"Epoch {epoch+1} - Accuracy: {acc:.4f}")
# === Final Report ===
# Pass the full label set explicitly. Without `labels`, the report infers the
# classes from y_true/y_pred, and raises if the validation split happens to
# miss a category because the count no longer matches `target_names`.
# LabelEncoder assigns ids 0..n_classes-1, in the order of `le.classes_`.
report = classification_report(
    y_true,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
)
print("\nClassification Report:\n", report)
# === Save Outputs ===
os.makedirs("artifacts", exist_ok=True)
# NOTE(review): the "artifacts" directory is created above, but every file
# below is written to the current working directory. Confirm which location
# the consumers of these files expect before changing either side.
torch.save(model.state_dict(), "resume_model.pt")
for fitted_obj, filename in ((vectorizer, "vectorizer.pkl"), (le, "label_encoder.pkl")):
    joblib.dump(fitted_obj, filename)
# NOTE(review): the leading character of this message looks like a
# mis-decoded symbol; left untouched until the intended glyph is confirmed.
print("β Model, vectorizer, and label encoder saved.")