boltuix
/

bert-local

@@ -351,217 +351,8 @@ bert-local is trained using **bert-mini** for multi-class text classification. H
   ```
 ### Training Code
-```python
-import pandas as pd
-from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, f1_score
-import torch
-from torch.utils.data import Dataset
-import shutil
-from tqdm import tqdm
-import numpy as np
-# === 0. Define model and output paths ===
-MODEL_NAME = "bert-mini"
-OUTPUT_DIR = "./bert-local"
-# === 1. Custom callback for tqdm progress bar ===
-class TQDMProgressBarCallback(TrainerCallback):
-    def __init__(self):
-        super().__init__()
-        self.progress_bar = None
-    def on_train_begin(self, args, state, control, **kwargs):
-        self.total_steps = state.max_steps
-        self.progress_bar = tqdm(total=self.total_steps, desc="Training", unit="step")
-    def on_step_end(self, args, state, control, **kwargs):
-        self.progress_bar.update(1)
-        self.progress_bar.set_postfix({
-            "epoch": f"{state.epoch:.2f}",
-            "step": state.global_step
-        })
-    def on_train_end(self, args, state, control, **kwargs):
-        if self.progress_bar is not None:
-            self.progress_bar.close()
-            self.progress_bar = None
-# === 2. Load and preprocess data ===
-dataset_path = 'dataset.csv'
-df = pd.read_csv(dataset_path)
-df = df.dropna(subset=['category'])
-df.columns = ['label', 'text']  # Rename columns
-# === 3. Encode labels ===
-labels = sorted(df["label"].unique())
-label_to_id = {label: idx for idx, label in enumerate(labels)}
-id_to_label = {idx: label for label, idx in label_to_id.items()}
-df['label'] = df['label'].map(label_to_id)
-# === 4. Train-val split ===
-train_texts, val_texts, train_labels, val_labels = train_test_split(
-    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42, stratify=df['label']
-)
-# === 5. Tokenizer ===
-tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
-# === 6. Dataset class ===
-class CategoryDataset(Dataset):
-    def __init__(self, texts, labels, tokenizer, max_length=128):
-        self.texts = texts
-        self.labels = labels
-        self.tokenizer = tokenizer
-        self.max_length = max_length
-    def __len__(self):
-        return len(self.texts)
-    def __getitem__(self, idx):
-        encoding = self.tokenizer(
-            self.texts[idx],
-            padding='max_length',
-            truncation=True,
-            max_length=self.max_length,
-            return_tensors='pt'
-        )
-        return {
-            'input_ids': encoding['input_ids'].squeeze(0),
-            'attention_mask': encoding['attention_mask'].squeeze(0),
-            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
-        }
-# === 7. Load datasets ===
-train_dataset = CategoryDataset(train_texts, train_labels, tokenizer)
-val_dataset = CategoryDataset(val_texts, val_labels, tokenizer)
-# === 8. Load model with num_labels ===
-model = BertForSequenceClassification.from_pretrained(
-    MODEL_NAME,
-    num_labels=len(label_to_id)
-)
-# === 9. Define metrics for evaluation ===
-def compute_metrics(eval_pred):
-    logits, labels = eval_pred
-    predictions = np.argmax(logits, axis=-1)
-    acc = accuracy_score(labels, predictions)
-    f1 = f1_score(labels, predictions, average='weighted')
-    return {
-        'accuracy': acc,
-        'f1_weighted': f1,
-    }
-# === 10. Training arguments ===
-training_args = TrainingArguments(
-    output_dir='./results',
-    run_name="bert-local",
-    num_train_epochs=5,
-    per_device_train_batch_size=16,
-    per_device_eval_batch_size=16,
-    warmup_steps=500,
-    weight_decay=0.01,
-    logging_dir='./logs',
-    logging_steps=10,
-    eval_strategy="epoch",
-    report_to="none"
-)
-# === 11. Trainer setup ===
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=train_dataset,
-    eval_dataset=val_dataset,
-    compute_metrics=compute_metrics,
-    callbacks=[TQDMProgressBarCallback()]
-)
-# === 12. Train and evaluate ===
-trainer.train()
-trainer.evaluate()
-# === 13. Save model and tokenizer ===
-model.config.label2id = label_to_id
-model.config.id2label = id_to_label
-model.config.num_labels = len(label_to_id)
-model.save_pretrained(OUTPUT_DIR)
-tokenizer.save_pretrained(OUTPUT_DIR)
-# === 14. Zip model directory ===
-shutil.make_archive("bert-local", 'zip', OUTPUT_DIR)
-print("✅ Training complete. Model and tokenizer saved to ./bert-local")
-print("✅ Model directory zipped to bert-local.zip")
-# === 15. Test function with confidence threshold ===
-def run_test_cases(model, tokenizer, test_sentences, label_to_id, id_to_label, confidence_threshold=0.5):
-    model.eval()
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model.to(device)
-    correct = 0
-    total = len(test_sentences)
-    results = []
-    for text, expected_label in test_sentences:
-        encoding = tokenizer(
-            text,
-            padding='max_length',
-            truncation=True,
-            max_length=128,
-            return_tensors='pt'
-        )
-        input_ids = encoding['input_ids'].to(device)
-        attention_mask = encoding['attention_mask'].to(device)
-        with torch.no_grad():
-            outputs = model(input_ids, attention_mask=attention_mask)
-            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
-            max_prob, predicted_id = torch.max(probs, dim=1)
-            predicted_label = id_to_label[predicted_id.item()]
-            if max_prob.item() < confidence_threshold:
-                predicted_label = "unknown"
-        is_correct = (predicted_label == expected_label)
-        if is_correct:
-            correct += 1
-        results.append({
-            "sentence": text,
-            "expected": expected_label,
-            "predicted": predicted_label,
-            "confidence": max_prob.item(),
-            "correct": is_correct
-        })
-    accuracy = correct / total * 100
-    print(f"\nTest Cases Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")
-    for r in results:
-        status = "✓" if r["correct"] else "✗"
-        print(f"{status} '{r['sentence']}'")
-        print(f"   Expected: {r['expected']}, Predicted: {r['predicted']}, Confidence: {r['confidence']:.3f}")
-    assert accuracy >= 70, f"Test failed: Accuracy {accuracy:.2f}% < 70%"
-    return results
-# === 16. Sample test sentences for testing ===
-test_sentences = [
-    ("Where is the nearest airport to this location?", "airport"),
-    ("Can I bring a laptop through airport security?", "airport"),
-    ("How do I get to the closest airport terminal?", "airport"),
-    ("Need help finding an accounting firm for tax planning.", "accounting firm"),
-    ("Can an accounting firm help with financial audits?", "accounting firm"),
-    ("Looking for an accounting firm to manage payroll.", "accounting firm"),
-]
-print("\nRunning test cases...")
-test_results = run_test_cases(model, tokenizer, test_sentences, label_to_id, id_to_label)
-print("✅ Test cases completed.")
-```
 ---
 ## Evaluation 📈

   ```
 ### Training Code
+- 📍 Get the training [Source Code](https://huggingface.co/boltuix/bert-local/blob/main/colab_training_code.ipynb) 🌟
+- 📍 Dataset (coming soon)
 ---
 ## Evaluation 📈