boltuix committed on
Commit
2d496ba
·
verified ·
1 Parent(s): 61cbf6a

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +2 -215
README.md CHANGED
@@ -350,222 +350,9 @@ print("✅ Supported Categories:", supported_labels)
350
  ```
351
  ---
352
 
353
-
354
  ### Training Code
355
- ```python
356
- import pandas as pd
357
- from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
358
- from sklearn.model_selection import train_test_split
359
- from sklearn.metrics import accuracy_score, f1_score
360
- import torch
361
- from torch.utils.data import Dataset
362
- import shutil
363
- from tqdm import tqdm
364
- import numpy as np
365
-
366
- # === 0. Define model and output paths ===
367
- MODEL_NAME = "boltuix/NeuroBERT"
368
- OUTPUT_DIR = "./neuro-nearby"
369
-
370
- # === 1. Custom callback for tqdm progress bar ===
371
- class TQDMProgressBarCallback(TrainerCallback):
372
- def __init__(self):
373
- super().__init__()
374
- self.progress_bar = None
375
-
376
- def on_train_begin(self, args, state, control, **kwargs):
377
- self.total_steps = state.max_steps
378
- self.progress_bar = tqdm(total=self.total_steps, desc="Training", unit="step")
379
-
380
- def on_step_end(self, args, state, control, **kwargs):
381
- self.progress_bar.update(1)
382
- self.progress_bar.set_postfix({
383
- "epoch": f"{state.epoch:.2f}",
384
- "step": state.global_step
385
- })
386
-
387
- def on_train_end(self, args, state, control, **kwargs):
388
- if self.progress_bar is not None:
389
- self.progress_bar.close()
390
- self.progress_bar = None
391
-
392
- # === 2. Load and preprocess data ===
393
- dataset_path = 'dataset.csv'
394
- df = pd.read_csv(dataset_path)
395
- df = df.dropna(subset=['category'])
396
- df.columns = ['label', 'text'] # Rename columns
397
-
398
- # === 3. Encode labels ===
399
- labels = sorted(df["label"].unique())
400
- label_to_id = {label: idx for idx, label in enumerate(labels)}
401
- id_to_label = {idx: label for label, idx in label_to_id.items()}
402
- df['label'] = df['label'].map(label_to_id)
403
-
404
- # === 4. Train-val split ===
405
- train_texts, val_texts, train_labels, val_labels = train_test_split(
406
- df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42, stratify=df['label']
407
- )
408
-
409
- # === 5. Tokenizer ===
410
- tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
411
-
412
- # === 6. Dataset class ===
413
- class CategoryDataset(Dataset):
414
- def __init__(self, texts, labels, tokenizer, max_length=128):
415
- self.texts = texts
416
- self.labels = labels
417
- self.tokenizer = tokenizer
418
- self.max_length = max_length
419
-
420
- def __len__(self):
421
- return len(self.texts)
422
-
423
- def __getitem__(self, idx):
424
- encoding = self.tokenizer(
425
- self.texts[idx],
426
- padding='max_length',
427
- truncation=True,
428
- max_length=self.max_length,
429
- return_tensors='pt'
430
- )
431
- return {
432
- 'input_ids': encoding['input_ids'].squeeze(0),
433
- 'attention_mask': encoding['attention_mask'].squeeze(0),
434
- 'labels': torch.tensor(self.labels[idx], dtype=torch.long)
435
- }
436
-
437
- # === 7. Load datasets ===
438
- train_dataset = CategoryDataset(train_texts, train_labels, tokenizer)
439
- val_dataset = CategoryDataset(val_texts, val_labels, tokenizer)
440
-
441
- # === 8. Load model with num_labels ===
442
- model = BertForSequenceClassification.from_pretrained(
443
- MODEL_NAME,
444
- num_labels=len(label_to_id)
445
- )
446
-
447
- # === 9. Define metrics for evaluation ===
448
- def compute_metrics(eval_pred):
449
- logits, labels = eval_pred
450
- predictions = np.argmax(logits, axis=-1)
451
- acc = accuracy_score(labels, predictions)
452
- f1 = f1_score(labels, predictions, average='weighted')
453
- return {
454
- 'accuracy': acc,
455
- 'f1_weighted': f1,
456
- }
457
-
458
- # === 10. Training arguments ===
459
- training_args = TrainingArguments(
460
- output_dir='./results',
461
- run_name="NeuroNearby",
462
- num_train_epochs=5,
463
- per_device_train_batch_size=16,
464
- per_device_eval_batch_size=16,
465
- warmup_steps=500,
466
- weight_decay=0.01,
467
- logging_dir='./logs',
468
- logging_steps=10,
469
- eval_strategy="epoch", # Corrected from evaluation_strategy
470
- report_to="none"
471
- )
472
-
473
-
474
- # === 11. Trainer setup ===
475
- trainer = Trainer(
476
- model=model,
477
- args=training_args,
478
- train_dataset=train_dataset,
479
- eval_dataset=val_dataset,
480
- compute_metrics=compute_metrics,
481
- callbacks=[TQDMProgressBarCallback()]
482
- )
483
-
484
- # === 12. Train and evaluate ===
485
- trainer.train()
486
- trainer.evaluate()
487
-
488
- # === 13. Save model and tokenizer ===
489
- model.config.label2id = label_to_id
490
- model.config.id2label = id_to_label
491
- model.config.num_labels = len(label_to_id)
492
-
493
- model.save_pretrained(OUTPUT_DIR)
494
- tokenizer.save_pretrained(OUTPUT_DIR)
495
-
496
- # === 14. Zip model directory ===
497
- shutil.make_archive("neuro-nearby", 'zip', OUTPUT_DIR)
498
- print("✅ Training complete. Model and tokenizer saved to ./neuro-nearby")
499
- print("✅ Model directory zipped to neuro-nearby.zip")
500
-
501
- # === 15. Test function with confidence threshold ===
502
- def run_test_cases(model, tokenizer, test_sentences, label_to_id, id_to_label, confidence_threshold=0.5):
503
- model.eval()
504
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
505
- model.to(device)
506
-
507
- correct = 0
508
- total = len(test_sentences)
509
- results = []
510
-
511
- for text, expected_label in test_sentences:
512
- encoding = tokenizer(
513
- text,
514
- padding='max_length',
515
- truncation=True,
516
- max_length=128,
517
- return_tensors='pt'
518
- )
519
- input_ids = encoding['input_ids'].to(device)
520
- attention_mask = encoding['attention_mask'].to(device)
521
-
522
- with torch.no_grad():
523
- outputs = model(input_ids, attention_mask=attention_mask)
524
- probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
525
- max_prob, predicted_id = torch.max(probs, dim=1)
526
- predicted_label = id_to_label[predicted_id.item()]
527
- # Apply confidence threshold
528
- if max_prob.item() < confidence_threshold:
529
- predicted_label = "unknown"
530
-
531
- is_correct = (predicted_label == expected_label)
532
- if is_correct:
533
- correct += 1
534
- results.append({
535
- "sentence": text,
536
- "expected": expected_label,
537
- "predicted": predicted_label,
538
- "confidence": max_prob.item(),
539
- "correct": is_correct
540
- })
541
-
542
- accuracy = correct / total * 100
543
- print(f"\nTest Cases Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")
544
-
545
- for r in results:
546
- status = "✓" if r["correct"] else "✗"
547
- print(f"{status} '{r['sentence']}'")
548
- print(f" Expected: {r['expected']}, Predicted: {r['predicted']}, Confidence: {r['confidence']:.3f}")
549
-
550
- assert accuracy >= 70, f"Test failed: Accuracy {accuracy:.2f}% < 70%"
551
- return results
552
-
553
- # === 16. Sample test sentences for testing ===
554
- test_sentences = [
555
- ("Where is the nearest airport to this location?", "airport"),
556
- ("Can I bring a laptop through airport security?", "airport"),
557
- ("How do I get to the closest airport terminal?", "airport"),
558
- ("Need help finding an accounting firm for tax planning.", "accounting firm"),
559
- ("Can an accounting firm help with financial audits?", "accounting firm"),
560
- ("Looking for an accounting firm to manage payroll.", "accounting firm"),
561
- # Add more diverse sentences covering your 155 categories
562
- ]
563
-
564
- print("\nRunning test cases...")
565
- test_results = run_test_cases(model, tokenizer, test_sentences, label_to_id, id_to_label)
566
- print("✅ Test cases completed.")
567
- ```
568
-
569
  ---
570
 
571
  ## Evaluation 📈
 
350
  ```
351
  ---
352
 
 
353
  ### Training Code
354
+ - 📍 Get training [Source Code](https://huggingface.co/boltuix/NeuroLocale/blob/main/colab_training_code.ipynb) 🌟
355
+ - 📍 Dataset (coming soon...)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  ---
357
 
358
  ## Evaluation 📈