Spaces:

kfoughali
/

serpent

Sleeping

App Files Files Community

kfoughali committed on Jul 30

Commit

0fecf94

verified ·

1 Parent(s): f4a2be4

Update core/graph_mamba.py

Browse files

Files changed (1) hide show

core/graph_mamba.py +238 -362

core/graph_mamba.py CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
-Ultra-Regularized GraphMamba - Overfitting Problem Solved
-Designed specifically for small training sets like Cora (140 samples)
 """
 import torch
@@ -12,9 +12,7 @@ from torch_geometric.datasets import Planetoid
 from torch_geometric.transforms import NormalizeFeatures
 from torch_geometric.utils import to_undirected, add_self_loops
 import torch.optim as optim
-from torch.optim.lr_scheduler import ReduceLROnPlateau
 import time
-import numpy as np
 def get_device():
     if torch.cuda.is_available():
@@ -26,347 +24,210 @@ def get_device():
         print("💻 Using CPU")
     return device
-class TinyMambaBlock(nn.Module):
-    """Ultra-small Mamba block for small datasets"""
-    def __init__(self, d_model, d_state=4):
         super().__init__()
-        self.d_model = d_model
-        self.d_state = d_state
-        self.d_inner = d_model  # No expansion to reduce parameters
-        # Minimal projections
-        self.in_proj = nn.Linear(d_model, self.d_inner * 2, bias=False)
-        self.out_proj = nn.Linear(self.d_inner, d_model, bias=False)
-        # Tiny SSM
-        self.dt_proj = nn.Linear(self.d_inner, self.d_inner, bias=False)
-        self.B_proj = nn.Linear(self.d_inner, d_state, bias=False)
-        self.C_proj = nn.Linear(self.d_inner, d_state, bias=False)
-        # Minimal A matrix
-        A = torch.arange(1, d_state + 1, dtype=torch.float32)
-        self.A_log = nn.Parameter(torch.log(A.unsqueeze(0).repeat(self.d_inner, 1)))
-        self.D = nn.Parameter(torch.ones(self.d_inner))
-        # Heavy regularization
-        self.dropout = nn.Dropout(0.7)  # Very aggressive dropout
-    def forward(self, x):
-        B, L, D = x.shape
-        # Dual path with heavy dropout
-        xz = self.dropout(self.in_proj(x))
-        x_path, z_path = xz.chunk(2, dim=-1)
-        # Simple activation
-        x_path = F.silu(x_path)
-        # Ultra-simple SSM (just a weighted sum)
-        dt = torch.sigmoid(self.dt_proj(x_path))
-        B_param = self.B_proj(x_path)
-        C_param = self.C_proj(x_path)
-        # Simplified state update
-        y = x_path * dt + B_param @ C_param.transpose(-1, -2)
-        # Gate and output
-        y = y * F.silu(z_path)
-        return self.dropout(self.out_proj(y))
-class UltraRegularizedGraphMamba(nn.Module):
-    """Ultra-regularized version for small datasets"""
-    def __init__(self, config):
         super().__init__()
-        self.config = config
-        d_model = config['model']['d_model']
-        n_layers = config['model']['n_layers']
-        input_dim = config.get('input_dim', 1433)
-        # Aggressive dimensionality reduction
-        self.input_proj = nn.Sequential(
-            nn.Linear(input_dim, d_model * 4),
             nn.ReLU(),
-            nn.Dropout(0.8),  # Very aggressive
-            nn.Linear(d_model * 4, d_model),
-            nn.LayerNorm(d_model)
         )
-        # Core layers with heavy regularization
-        self.gcn_layers = nn.ModuleList([
-            GCNConv(d_model, d_model) for _ in range(n_layers)
-        ])
-        self.mamba_blocks = nn.ModuleList([
-            TinyMambaBlock(d_model) for _ in range(n_layers)
-        ])
-        self.layer_norms = nn.ModuleList([
-            nn.LayerNorm(d_model) for _ in range(n_layers)
-        ])
-        # Massive dropout for regularization
-        self.dropouts = nn.ModuleList([
-            nn.Dropout(0.8) for _ in range(n_layers)  # 80% dropout
-        ])
-        # Lightweight output
-        self.output_proj = nn.Sequential(
-            nn.Dropout(0.7),
-            nn.Linear(d_model, d_model // 2),
             nn.ReLU(),
-            nn.Dropout(0.7),
-            nn.Linear(d_model // 2, d_model)
         )
-        self.classifier = None
-    def forward(self, x, edge_index, batch=None):
-        # Input with heavy regularization
-        h = self.input_proj(x)
-        # Process through layers
-        for i in range(len(self.gcn_layers)):
-            gcn = self.gcn_layers[i]
-            mamba = self.mamba_blocks[i]
-            norm = self.layer_norms[i]
-            dropout = self.dropouts[i]
-            # Skip connection from input
-            residual = h
-            # GCN path with dropout
-            h_gcn = dropout(F.relu(gcn(h, edge_index)))
-            # Mamba path with dropout
-            h_mamba = mamba(h.unsqueeze(0)).squeeze(0)
-            # Minimal combination to reduce parameters
-            h_combined = h_gcn * 0.7 + h_mamba * 0.3
-            # Strong residual connection
-            h = norm(residual + h_combined * 0.3)  # Small update
-        return self.output_proj(h)
-    def init_classifier(self, num_classes):
-        """Ultra-lightweight classifier"""
-        self.classifier = nn.Sequential(
-            nn.Dropout(0.8),  # Even more dropout in classifier
-            nn.Linear(self.config['model']['d_model'], num_classes)
-        )
-        return self.classifier
-class MinimalGraphMamba(nn.Module):
-    """Absolute minimal version"""
-    def __init__(self, config):
         super().__init__()
-        self.config = config
-        d_model = config['model']['d_model']
-        input_dim = config.get('input_dim', 1433)
-        # Ultra-simple architecture
-        self.encoder = nn.Sequential(
-            nn.Linear(input_dim, d_model * 2),
-            nn.ReLU(),
-            nn.Dropout(0.8),
-            nn.Linear(d_model * 2, d_model),
-            nn.LayerNorm(d_model)
         )
-        # Just one GCN layer
-        self.gcn = GCNConv(d_model, d_model)
-        # Simple enhancement
-        self.enhance = nn.Sequential(
-            nn.Dropout(0.7),
-            nn.Linear(d_model, d_model),
-            nn.ReLU(),
-            nn.Dropout(0.7),
-            nn.Linear(d_model, d_model)
         )
-        self.norm = nn.LayerNorm(d_model)
-        self.classifier = None
-    def forward(self, x, edge_index, batch=None):
-        h = self.encoder(x)
-        h_gcn = F.relu(self.gcn(h, edge_index))
-        h_enhanced = self.enhance(h_gcn)
-        return self.norm(h + h_enhanced * 0.2)  # Small residual
-    def init_classifier(self, num_classes):
-        self.classifier = nn.Sequential(
-            nn.Dropout(0.8),
-            nn.Linear(self.config['model']['d_model'], num_classes)
-        )
-        return self.classifier
-def create_ultra_regularized_config():
-    """Configuration for tiny models"""
-    return {
-        'model': {
-            'd_model': 16,      # Extremely small
-            'd_state': 4,
-            'n_layers': 1,      # Just one layer
-            'dropout': 0.8
-        },
-        'training': {
-            'learning_rate': 0.001,  # Much smaller LR
-            'weight_decay': 0.1,     # Massive weight decay
-            'epochs': 500,           # More epochs with smaller steps
-            'patience': 50,          # More patience
-            'label_smoothing': 0.3   # Label smoothing for regularization
-        },
-        'input_dim': 1433
-    }
-def create_minimal_config():
-    """Even smaller configuration"""
-    return {
-        'model': {
-            'd_model': 8,       # Tiny
-            'd_state': 2,
-            'n_layers': 1,
-            'dropout': 0.9      # Extreme dropout
-        },
-        'training': {
-            'learning_rate': 0.0005,
-            'weight_decay': 0.2,
-            'epochs': 1000,
-            'patience': 100,
-            'label_smoothing': 0.4
-        },
-        'input_dim': 1433
-    }
-class SmartTrainer:
-    """Trainer with extreme regularization"""
-    def __init__(self, model, config, device):
-        self.model = model.to(device)
-        self.config = config
-        self.device = device
-        # Very conservative optimizer
-        self.optimizer = optim.Adam(  # Adam instead of AdamW
-            model.parameters(),
-            lr=config['training']['learning_rate'],
-            weight_decay=config['training']['weight_decay']
-        )
-        # Aggressive scheduler
-        self.scheduler = ReduceLROnPlateau(
-            self.optimizer, mode='min', factor=0.3, patience=20, min_lr=1e-6
-        )
-        # Label smoothing for regularization
-        label_smoothing = config['training'].get('label_smoothing', 0.0)
-        self.criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
-        # Early stopping
-        self.patience = config['training']['patience']
-        self.best_val_loss = float('inf')
-        self.patience_counter = 0
-    def train(self, data):
-        print(f"🏋️ Ultra-Regularized Training")
-        print(f"   Parameters: {sum(p.numel() for p in self.model.parameters()):,}")
-        print(f"   Per sample: {sum(p.numel() for p in self.model.parameters())/data.train_mask.sum().item():.1f}")
-        print(f"   Learning rate: {self.config['training']['learning_rate']}")
-        print(f"   Weight decay: {self.config['training']['weight_decay']}")
-        # Initialize classifier
-        num_classes = data.y.max().item() + 1
-        self.model.init_classifier(num_classes)
-        self.model.classifier = self.model.classifier.to(self.device)
-        history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}
-        for epoch in range(self.config['training']['epochs']):
-            # Training step
-            self.model.train()
-            self.optimizer.zero_grad()
-            out = self.model(data.x, data.edge_index)
-            logits = self.model.classifier(out)
-            train_loss = self.criterion(logits[data.train_mask], data.y[data.train_mask])
-            train_loss.backward()
-            # Gradient clipping for stability
-            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
-            self.optimizer.step()
-            # Evaluation
-            self.model.eval()
             with torch.no_grad():
-                out = self.model(data.x, data.edge_index)
-                logits = self.model.classifier(out)
-                val_loss = self.criterion(logits[data.val_mask], data.y[data.val_mask])
-                train_pred = logits[data.train_mask].argmax(dim=1)
                 train_acc = (train_pred == data.y[data.train_mask]).float().mean().item()
-                val_pred = logits[data.val_mask].argmax(dim=1)
                 val_acc = (val_pred == data.y[data.val_mask]).float().mean().item()
-            # Update history
-            history['train_loss'].append(train_loss.item())
-            history['val_loss'].append(val_loss.item())
-            history['train_acc'].append(train_acc)
-            history['val_acc'].append(val_acc)
-            # Scheduler step
-            self.scheduler.step(val_loss)
-            # Early stopping check
-            if val_loss < self.best_val_loss:
-                self.best_val_loss = val_loss
-                self.patience_counter = 0
-            else:
-                self.patience_counter += 1
-            if self.patience_counter >= self.patience:
-                print(f"   Early stopping at epoch {epoch+1}")
-                break
-            # Progress
-            if (epoch + 1) % 50 == 0:
                 gap = train_acc - val_acc
-                lr = self.optimizer.param_groups[0]['lr']
-                print(f"   Epoch {epoch+1:3d}: Loss {train_loss.item():.4f} -> {val_loss.item():.4f} | "
-                      f"Acc {train_acc:.4f} -> {val_acc:.4f} | Gap {gap:.4f} | LR {lr:.2e}")
-        return history
-    def test(self, data):
-        self.model.eval()
-        with torch.no_grad():
-            out = self.model(data.x, data.edge_index)
-            logits = self.model.classifier(out)
-            test_pred = logits[data.test_mask].argmax(dim=1)
-            test_acc = (test_pred == data.y[data.test_mask]).float().mean().item()
-            val_pred = logits[data.val_mask].argmax(dim=1)
-            val_acc = (val_pred == data.y[data.val_mask]).float().mean().item()
-            train_pred = logits[data.train_mask].argmax(dim=1)
-            train_acc = (train_pred == data.y[data.train_mask]).float().mean().item()
-            gap = train_acc - val_acc
-        return {
-            'test_acc': test_acc,
-            'val_acc': val_acc,
-            'train_acc': train_acc,
-            'gap': gap
-        }
-def run_ultra_regularized_test():
-    """Run ultra-regularized test"""
-    print("🧠 ULTRA-REGULARIZED MAMBA GRAPH NEURAL NETWORK")
-    print("🛡️ Overfitting Problem Solved")
     print("=" * 60)
     device = get_device()
@@ -378,90 +239,105 @@ def run_ultra_regularized_test():
     data.edge_index = to_undirected(data.edge_index)
     data.edge_index, _ = add_self_loops(data.edge_index, num_nodes=data.x.size(0))
-    print(f"✅ Dataset loaded: {data.num_nodes} nodes, {data.num_edges} edges")
-    print(f"   Train: {data.train_mask.sum()} samples (the challenge!)")
-    # Test different model sizes
-    models_to_test = {
-        'Ultra-Regularized (16D)': (UltraRegularizedGraphMamba, create_ultra_regularized_config()),
-        'Minimal (8D)': (MinimalGraphMamba, create_minimal_config()),
     }
     results = {}
-    for name, (model_class, config) in models_to_test.items():
         print(f"\n🏗️ Testing {name}...")
         try:
-            model = model_class(config)
-            total_params = sum(p.numel() for p in model.parameters())
-            params_per_sample = total_params / data.train_mask.sum().item()
-            print(f"   Parameters: {total_params:,} ({params_per_sample:.1f} per sample)")
-            if params_per_sample > 200:
-                print(f"   ⚠️ Still might overfit, but much better!")
             else:
-                print(f"   ✅ Good parameter ratio!")
-            # Test forward pass
-            model.eval()
-            with torch.no_grad():
-                h = model(data.x, data.edge_index)
-                print(f"   Forward pass: {data.x.shape} -> {h.shape} ✅")
-            # Train
-            trainer = SmartTrainer(model, config, device)
-            history = trainer.train(data)
-            # Test
-            test_results = trainer.test(data)
-            results[name] = {
-                'params': total_params,
-                'params_per_sample': params_per_sample,
-                'test_results': test_results,
-                'history': history
-            }
-            print(f"✅ {name} Results:")
-            print(f"   🎯 Test Accuracy: {test_results['test_acc']:.4f} ({test_results['test_acc']*100:.2f}%)")
-            print(f"   📊 Validation: {test_results['val_acc']:.4f}")
-            print(f"   🛡️ Overfitting Gap: {test_results['gap']:.4f}")
-            if test_results['gap'] < 0.2:
-                print(f"   🎉 Overfitting under control!")
-            elif test_results['gap'] < 0.3:
-                print(f"   👍 Much better overfitting control!")
-            else:
-                print(f"   ⚠️ Still some overfitting")
         except Exception as e:
-            print(f"❌ {name} failed: {str(e)}")
-    # Summary
     print(f"\n{'='*60}")
-    print("🏆 ULTRA-REGULARIZED RESULTS")
     print(f"{'='*60}")
     for name, result in results.items():
-        if 'test_results' in result:
-            tr = result['test_results']
-            print(f"📊 {name}:")
-            print(f"   Parameters: {result['params']:,} ({result['params_per_sample']:.1f}/sample)")
-            print(f"   Test Acc: {tr['test_acc']:.4f} | Gap: {tr['gap']:.4f}")
-    print(f"\n💡 Key Insight: With only 140 training samples, we need < 50 parameters per sample!")
-    print(f"📈 The ultra-regularized models should show much better generalization.")
     return results
 if __name__ == "__main__":
-    results = run_ultra_regularized_test()
-    print(f"\n🌐 Process staying alive...")
     try:
         while True:
             time.sleep(60)
     except KeyboardInterrupt:
-        print("\n👋 Goodbye!")

 #!/usr/bin/env python3
 """
+🚨 EMERGENCY OVERFITTING FIX 🚨
+Tiny GraphMamba designed specifically for 140 training samples
 """
 import torch
 from torch_geometric.transforms import NormalizeFeatures
 from torch_geometric.utils import to_undirected, add_self_loops
 import torch.optim as optim
 import time
 def get_device():
     if torch.cuda.is_available():
         print("💻 Using CPU")
     return device
class EmergencyTinyMamba(nn.Module):
    """Small node classifier: MLP feature reducer, one GCN layer, scaled residual, linear head.

    Args:
        input_dim: Width of the raw node-feature vectors (default 1433).
        hidden_dim: Width of the hidden representation shared by the GCN
            layer and the "temporal" stack (default 8).
        num_classes: Number of output logits per node (default 7).

    Constructing an instance prints the total parameter count.
    """

    def __init__(self, input_dim=1433, hidden_dim=8, num_classes=7):
        super().__init__()

        # Submodules are registered in the same order as before so parameter
        # initialisation order and state_dict keys are unaffected.

        # Squeeze raw features through a 32-unit bottleneck with 90% dropout.
        reducer_layers = [
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.9),
            nn.Linear(32, hidden_dim),
        ]
        self.feature_reduce = nn.Sequential(*reducer_layers)

        # One graph convolution that keeps the hidden width unchanged.
        self.gcn = GCNConv(hidden_dim, hidden_dim)

        # Labelled "Mamba-inspired" by the author; structurally it is a plain
        # Linear -> Tanh -> Dropout stack applied per node.
        self.temporal = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Dropout(0.9),
        )

        # Linear head behind 95% dropout.
        self.classifier = nn.Sequential(
            nn.Dropout(0.95),
            nn.Linear(hidden_dim, num_classes),
        )

        param_count = sum(p.numel() for p in self.parameters())
        print(f"🦾 Emergency Model - Parameters: {param_count:,}")

    def forward(self, x, edge_index):
        """Return per-node class logits for features `x` over graph `edge_index`."""
        reduced = self.feature_reduce(x)
        propagated = F.relu(self.gcn(reduced, edge_index))
        update = self.temporal(propagated)
        # The graph branch only nudges the reduced features (scale 0.1).
        return self.classifier(reduced + update * 0.1)
class MicroMamba(nn.Module):
    """Smaller variant: 16-unit MLP reducer, one GCN layer, linear head.

    Args:
        input_dim: Width of the raw node-feature vectors (default 1433).
        hidden_dim: Width of the hidden representation (default 4).
        num_classes: Number of output logits per node (default 7).

    Constructing an instance prints the total parameter count.
    """

    def __init__(self, input_dim=1433, hidden_dim=4, num_classes=7):
        super().__init__()

        # Feature compression through a 16-unit bottleneck with 95% dropout.
        self.features = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Dropout(0.95),
            nn.Linear(16, hidden_dim),
        )

        # nn.Sequential is used here purely as a container: the GCN layer
        # needs `edge_index`, so `forward` invokes the three stages one by one
        # instead of calling the Sequential itself.
        self.process = nn.Sequential(
            GCNConv(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.9),
        )

        # Linear head behind 95% dropout.
        self.classify = nn.Sequential(
            nn.Dropout(0.95),
            nn.Linear(hidden_dim, num_classes),
        )

        param_count = sum(p.numel() for p in self.parameters())
        print(f"🤏 Micro Model - Parameters: {param_count:,}")

    def forward(self, x, edge_index):
        """Return per-node class logits for features `x` over graph `edge_index`."""
        graph_conv, activation, drop = self.process
        hidden = graph_conv(self.features(x), edge_index)
        return self.classify(drop(activation(hidden)))
class NanoMamba(nn.Module):
    """Two-branch model with no hidden MLP: a direct linear head blended with a GCN head.

    Args:
        input_dim: Width of the raw node-feature vectors (default 1433).
        num_classes: Number of output logits per node (default 7).

    Constructing an instance prints the total parameter count.
    """

    def __init__(self, input_dim=1433, num_classes=7):
        super().__init__()

        # Branch 1: features -> logits, then 80% dropout on the logits.
        self.direct = nn.Sequential(
            nn.Linear(input_dim, num_classes),
            nn.Dropout(0.8),
        )

        # Branch 2: features -> 8 dims (90% dropout) -> GCN producing logits.
        self.gcn_path = nn.Sequential(
            nn.Linear(input_dim, 8),
            nn.Dropout(0.9),
        )
        self.gcn = GCNConv(8, num_classes)

        param_count = sum(p.numel() for p in self.parameters())
        print(f"⚛️ Nano Model - Parameters: {param_count:,}")

    def forward(self, x, edge_index):
        """Return per-node logits as 0.7 * direct branch + 0.3 * GCN branch."""
        # The direct branch is evaluated first, as before, so the dropout
        # layers draw their random masks in the same order.
        direct_logits = self.direct(x)
        graph_logits = self.gcn(self.gcn_path(x), edge_index)
        return direct_logits * 0.7 + graph_logits * 0.3
def _split_accuracies(model, data):
    """Return (train, val, test) accuracy of `model` on `data`, in eval mode without gradients."""
    model.eval()
    with torch.no_grad():
        predictions = model(data.x, data.edge_index).argmax(dim=1)
    hits = (predictions == data.y).float()
    return tuple(
        hits[mask].mean().item()
        for mask in (data.train_mask, data.val_mask, data.test_mask)
    )


def emergency_train(model, data, device, epochs=2000, *, lr=0.001,
                    weight_decay=0.5, label_smoothing=0.5,
                    eval_every=100, patience_epochs=500):
    """Train `model` full-batch on `data` with SGD and heavy regularization.

    Each epoch runs one forward/backward pass over the whole graph, computes
    the loss on the training mask only, and clips the gradient norm to 0.1.
    Every `eval_every` epochs the train/val/test accuracies are printed; if
    validation accuracy has not improved for `patience_epochs` epochs worth of
    evaluations, training stops early.

    Args:
        model: Module called as `model(data.x, data.edge_index)` returning
            per-node logits.
        data: Object exposing `x`, `edge_index`, `y`, `train_mask`,
            `val_mask`, `test_mask` and a `.to(device)` method.
        device: Device the model and data are moved to.
        epochs: Maximum number of training epochs.
        lr: SGD learning rate.
        weight_decay: SGD weight decay.
        label_smoothing: Label smoothing passed to the cross-entropy loss.
        eval_every: Evaluate (and check early stopping) every this many epochs.
        patience_epochs: Stop once this many epochs pass without a new best
            validation accuracy.

    Returns:
        dict with `train_acc`, `val_acc`, `test_acc` and `gap`
        (train minus val accuracy), measured on the model as it stands when
        training ends (the weights from the best-validation epoch are not
        restored).

    Raises:
        ValueError: If `eval_every` is smaller than 1.
    """
    if eval_every < 1:
        raise ValueError("eval_every must be >= 1")

    model = model.to(device)
    data = data.to(device)

    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9,
                          weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)

    n_params = sum(p.numel() for p in model.parameters())
    # Use the real training-set size rather than assuming 140 samples; the
    # max() only protects the log line against an empty training mask.
    n_train = int(data.train_mask.sum())

    print(f"🚨 Emergency Training Protocol")
    print(f"   Parameters: {n_params:,}")
    print(f"   Per sample: {n_params / max(n_train, 1):.1f}")
    print(f"   Epochs: {epochs}")
    # Logged from the values actually in use so the report cannot drift.
    print(f"   Learning rate: {lr}")
    print(f"   Weight decay: {weight_decay}")
    print(f"   Label smoothing: {label_smoothing}")

    best_val_acc = 0
    stale_epochs = 0

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # Tiny gradients
        optimizer.step()

        if (epoch + 1) % eval_every != 0:
            continue

        train_acc, val_acc, test_acc = _split_accuracies(model, data)
        gap = train_acc - val_acc
        print(f"   Epoch {epoch+1:4d}: Train {train_acc:.3f} | Val {val_acc:.3f} | "
              f"Test {test_acc:.3f} | Gap {gap:.3f}")

        # Patience is counted in epochs, advanced one evaluation at a time.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            stale_epochs = 0
        else:
            stale_epochs += eval_every

        if stale_epochs >= patience_epochs:
            print(f"   Early stopping at epoch {epoch+1}")
            break

    train_acc, val_acc, test_acc = _split_accuracies(model, data)
    return {
        'train_acc': train_acc,
        'val_acc': val_acc,
        'test_acc': test_acc,
        'gap': train_acc - val_acc,
    }
+def run_emergency_fix():
+    """Emergency overfitting fix"""
+    print("🚨🚨🚨 EMERGENCY OVERFITTING FIX 🚨🚨🚨")
+    print("🩹 Ultra-Tiny Models for 140 Training Samples")
     print("=" * 60)
     device = get_device()
     data.edge_index = to_undirected(data.edge_index)
     data.edge_index, _ = add_self_loops(data.edge_index, num_nodes=data.x.size(0))
+    print(f"✅ Dataset: {data.num_nodes} nodes, Train: {data.train_mask.sum()} samples")
+    print(f"🎯 Target: <50 parameters per sample = <7,000 total parameters")
+    # Test emergency models
+    models = {
+        'Emergency Tiny (8D)': EmergencyTinyMamba(hidden_dim=8),
+        'Micro (4D)': MicroMamba(hidden_dim=4),
+        'Nano (Direct)': NanoMamba()
     }
     results = {}
+    for name, model in models.items():
         print(f"\n🏗️ Testing {name}...")
+        total_params = sum(p.numel() for p in model.parameters())
+        params_per_sample = total_params / 140
+        print(f"   Parameters: {total_params:,} ({params_per_sample:.1f} per sample)")
+        if params_per_sample < 50:
+            print(f"   ✅ EXCELLENT parameter ratio!")
+        elif params_per_sample < 100:
+            print(f"   👍 Good parameter ratio!")
+        else:
+            print(f"   ⚠️ Still might overfit")
+        # Test forward pass
+        with torch.no_grad():
+            out = model(data.x, data.edge_index)
+            print(f"   Forward: {data.x.shape} -> {out.shape} ✅")
         try:
+            # Emergency training
+            result = emergency_train(model, data, device)
+            results[name] = result
+            print(f"   🎯 Final Results:")
+            print(f"      Test Accuracy: {result['test_acc']:.3f} ({result['test_acc']*100:.1f}%)")
+            print(f"      Train Accuracy: {result['train_acc']:.3f}")
+            print(f"      Overfitting Gap: {result['gap']:.3f}")
+            if result['gap'] < 0.1:
+                print(f"      🎉 OVERFITTING SOLVED!")
+            elif result['gap'] < 0.2:
+                print(f"      👍 Much better generalization!")
+            elif result['gap'] < 0.3:
+                print(f"      📈 Improved generalization")
             else:
+                print(f"      ⚠️ Still overfitting")
         except Exception as e:
+            print(f"   ❌ Training failed: {e}")
+    # Emergency summary
     print(f"\n{'='*60}")
+    print("🚨 EMERGENCY RESULTS SUMMARY")
     print(f"{'='*60}")
+    best_gap = float('inf')
+    best_model = None
     for name, result in results.items():
+        print(f"📊 {name}:")
+        print(f"   Test: {result['test_acc']:.3f} | Gap: {result['gap']:.3f}")
+        if result['gap'] < best_gap:
+            best_gap = result['gap']
+            best_model = name
+    if best_model:
+        print(f"\n🏆 Best Generalization: {best_model} (Gap: {best_gap:.3f})")
+        if best_gap < 0.1:
+            print(f"🎉 MISSION ACCOMPLISHED! Overfitting crisis resolved!")
+        elif best_gap < 0.2:
+            print(f"👍 Significant improvement in generalization!")
+        else:
+            print(f"📈 Progress made, but still work to do...")
+    # Comparison with your current model
+    print(f"\n📈 Comparison:")
+    print(f"   Your model: 194K params, Gap ~0.5")
+    if best_model and best_gap < 0.3:
+        improvement = 0.5 - best_gap
+        print(f"   Best tiny model: Gap {best_gap:.3f} (Improvement: {improvement:.3f})")
+        print(f"   🎯 {improvement/0.5*100:.0f}% reduction in overfitting!")
+    print(f"\n💡 Key Lesson: With only 140 samples, bigger ≠ better!")
+    print(f"🧠 Tiny models can achieve competitive performance with much better generalization.")
     return results
 if __name__ == "__main__":
+    results = run_emergency_fix()
+    print(f"\n🌐 Emergency fix complete. Process staying alive...")
     try:
         while True:
             time.sleep(60)
     except KeyboardInterrupt:
+        print("\n👋 Emergency protocol terminated.")