kfoughali committed
Commit b09f924 · verified · 1 Parent(s): 617d132

Update core/graph_mamba.py

Files changed (1)
  1. core/graph_mamba.py  +277 -209
core/graph_mamba.py CHANGED
@@ -1,247 +1,315 @@
  import torch
  import torch.nn as nn
- from .mamba_block import MambaBlock
- from .graph_sequencer import GraphSequencer, PositionalEncoder


- class GraphMamba(nn.Module):
-     """
-     Production Graph-Mamba model with training optimizations
-     """
      def __init__(self, config):
          super().__init__()

          self.config = config
-         self.d_model = config['model']['d_model']
-         self.n_layers = config['model']['n_layers']
-         self.dropout = config['model']['dropout']
          self.ordering_strategy = config['ordering']['strategy']

-         # Input projection (dynamic input dimension)
-         self.input_proj = None

-         # Positional encoding
-         self.pos_encoder = PositionalEncoder()
-         self.pos_embed = nn.Linear(11, self.d_model)

-         # Mamba layers with residual connections
          self.mamba_layers = nn.ModuleList([
-             MambaBlock(
-                 d_model=self.d_model,
-                 d_state=config['model']['d_state'],
-                 d_conv=config['model']['d_conv'],
-                 expand=config['model']['expand']
-             )
-             for _ in range(self.n_layers)
          ])

          # Layer norms
          self.layer_norms = nn.ModuleList([
-             nn.LayerNorm(self.d_model)
-             for _ in range(self.n_layers)
          ])

-         # Dropout
-         self.dropout_layer = nn.Dropout(self.dropout)
-
-         # Graph sequencer
-         self.sequencer = GraphSequencer()

-         # Classification head (initialized dynamically)
          self.classifier = None

-         # Cache for efficiency
-         self._cache = {}
-
-     def _init_input_proj(self, input_dim, device):
-         """Initialize input projection dynamically"""
-         if self.input_proj is None:
-             self.input_proj = nn.Sequential(
-                 nn.Linear(input_dim, self.d_model),
-                 nn.LayerNorm(self.d_model),
-                 nn.ReLU(),
-                 nn.Dropout(self.dropout * 0.5)
-             ).to(device)
-
-     def _init_classifier(self, num_classes, device):
-         """Initialize classifier dynamically"""
-         if self.classifier is None:
-             self.classifier = nn.Sequential(
-                 nn.Linear(self.d_model, self.d_model // 2),
-                 nn.LayerNorm(self.d_model // 2),
-                 nn.ReLU(),
-                 nn.Dropout(self.dropout),
-                 nn.Linear(self.d_model // 2, num_classes)
-             ).to(device)
-
-     def forward(self, x, edge_index, batch=None):
-         """
-         Forward pass with training optimizations
-         """
-         num_nodes = x.size(0)
-         input_dim = x.size(1)
-         device = x.device
-
-         # Move all components to correct device
-         self.to(device)
-
-         # Initialize input projection if needed
-         self._init_input_proj(input_dim, device)
-
-         # Project input features
-         h = self.input_proj(x)  # (num_nodes, d_model)
-
-         if batch is None:
-             # Single graph processing
-             h = self._process_single_graph(h, edge_index)
-         else:
-             # Batch processing
-             h = self._process_batch(h, edge_index, batch)
-
-         return h

-     def _process_single_graph(self, h, edge_index):
-         """Process a single graph with caching"""
-         num_nodes = h.size(0)
-         device = h.device
-
-         # Ensure edge_index is on correct device
-         edge_index = edge_index.to(device)
-
-         # Cache key for ordering
-         cache_key = f"{self.ordering_strategy}_{num_nodes}_{edge_index.shape[1]}"
-
-         # Get ordering (with caching during training)
-         if cache_key not in self._cache or not self.training:
-             if self.ordering_strategy == "spectral":
-                 order = self.sequencer.spectral_ordering(edge_index, num_nodes)
-             elif self.ordering_strategy == "degree":
-                 order = self.sequencer.degree_ordering(edge_index, num_nodes)
-             elif self.ordering_strategy == "community":
-                 order = self.sequencer.community_ordering(edge_index, num_nodes)
-             else:  # default to BFS
-                 order = self.sequencer.bfs_ordering(edge_index, num_nodes)

-             if self.training:
-                 self._cache[cache_key] = order
-         else:
-             order = self._cache[cache_key]
-
-         # Ensure order is on correct device
-         order = order.to(device)

-         # Add positional encoding
-         seq_pos, distances = self.pos_encoder.encode_positions(h, edge_index, order)
-         seq_pos = seq_pos.to(device)
-         distances = distances.to(device)

-         pos_features = torch.cat([seq_pos, distances], dim=1)  # (num_nodes, 11)
-         pos_embed = self.pos_embed(pos_features)

-         # Reorder nodes for sequential processing
-         h_ordered = h[order] + pos_embed[order]  # Add positional encoding
-         h_ordered = h_ordered.unsqueeze(0)  # (1, num_nodes, d_model)

-         # Process through Mamba layers with residual connections
-         for i, (mamba, ln) in enumerate(zip(self.mamba_layers, self.layer_norms)):
-             # Pre-norm residual connection with gradient scaling
              residual = h_ordered
              h_ordered = ln(h_ordered)
-             h_ordered = mamba(h_ordered)
-             h_ordered = residual + self.dropout_layer(h_ordered)
-
-             # Layer-wise learning rate scaling
-             if self.training:
-                 h_ordered = h_ordered * (1.0 - 0.1 * i / self.n_layers)

          # Restore original order
-         h_out = h_ordered.squeeze(0)  # (num_nodes, d_model)

-         # Create inverse mapping
-         inverse_order = torch.argsort(order)
-         h_final = h_out[inverse_order]
-
-         return h_final

-     def _process_batch(self, h, edge_index, batch):
-         """Process batched graphs efficiently"""
-         device = h.device
-         batch = batch.to(device)
-         edge_index = edge_index.to(device)

-         batch_size = batch.max().item() + 1
-         outputs = []

-         for b in range(batch_size):
-             # Extract subgraph
-             mask = batch == b
-             batch_h = h[mask]
-
-             # Get edges for this graph
-             edge_mask = mask[edge_index[0]] & mask[edge_index[1]]
-             batch_edges = edge_index[:, edge_mask]
-
-             if batch_edges.shape[1] > 0:
-                 # Reindex edges to local indices
-                 node_indices = torch.where(mask)[0]
-                 node_map = torch.zeros(h.size(0), dtype=torch.long, device=device)
-                 node_map[node_indices] = torch.arange(batch_h.size(0), device=device)
-                 batch_edges_local = node_map[batch_edges]
-             else:
-                 # Empty graph
-                 batch_edges_local = torch.empty((2, 0), dtype=torch.long, device=device)
-
-             # Process subgraph
-             batch_output = self._process_single_graph(batch_h, batch_edges_local)
-             outputs.append(batch_output)
-
-         # Reconstruct full batch
-         h_out = torch.zeros_like(h)
-         for b, output in enumerate(outputs):
-             mask = batch == b
-             h_out[mask] = output
-
-         return h_out

-     def get_graph_embedding(self, h, batch=None):
-         """Get graph-level representation with multiple pooling"""
-         if batch is None:
-             # Single graph - multiple pooling strategies
-             mean_pool = h.mean(dim=0, keepdim=True)
-             max_pool = h.max(dim=0)[0].unsqueeze(0)
-
-             # Attention pooling
-             attn_weights = torch.softmax(h.sum(dim=1), dim=0)
-             attn_pool = (h * attn_weights.unsqueeze(1)).sum(dim=0, keepdim=True)
-
-             return torch.cat([mean_pool, max_pool, attn_pool], dim=1)
-         else:
-             # Batched graphs
-             device = h.device
-             batch = batch.to(device)
-             batch_size = batch.max().item() + 1
-
-             graph_embeddings = []
-             for b in range(batch_size):
-                 mask = batch == b
-                 if mask.any():
-                     batch_h = h[mask]
-
-                     # Multiple pooling for this graph
-                     mean_pool = batch_h.mean(dim=0)
-                     max_pool = batch_h.max(dim=0)[0]
-
-                     attn_weights = torch.softmax(batch_h.sum(dim=1), dim=0)
-                     attn_pool = (batch_h * attn_weights.unsqueeze(1)).sum(dim=0)
-
-                     graph_emb = torch.cat([mean_pool, max_pool, attn_pool])
-                     graph_embeddings.append(graph_emb)
-                 else:
-                     # Empty graph
-                     graph_embeddings.append(torch.zeros(h.size(1) * 3, device=device))
-
-             return torch.stack(graph_embeddings)

-     def clear_cache(self):
-         """Clear ordering cache"""
-         self._cache.clear()
 
  import torch
  import torch.nn as nn
+ import torch.nn.functional as F
+ from torch_geometric.utils import degree, to_dense_batch
+ import networkx as nx
+ import numpy as np
+ import logging

+ logger = logging.getLogger(__name__)
+
+ class MambaBlock(nn.Module):
+     """Enhanced Mamba block with optimizations"""
+     def __init__(self, d_model, d_state=16, d_conv=4, expand=2):
+         super().__init__()
+         self.d_model = d_model
+         self.d_inner = int(expand * d_model)
+         self.d_state = d_state
+
+         self.in_proj = nn.Linear(d_model, self.d_inner * 2, bias=False)
+         self.conv1d = nn.Conv1d(self.d_inner, self.d_inner, d_conv, groups=self.d_inner, padding=d_conv-1)
+         self.act = nn.SiLU()
+         self.x_proj = nn.Linear(self.d_inner, d_state * 2 + 1, bias=False)
+         self.dt_proj = nn.Linear(1, self.d_inner, bias=True)
+
+         A = torch.arange(1, d_state + 1, dtype=torch.float32).unsqueeze(0).repeat(self.d_inner, 1)
+         self.A_log = nn.Parameter(torch.log(A))
+         self.D = nn.Parameter(torch.ones(self.d_inner))
+         self.out_proj = nn.Linear(self.d_inner, d_model, bias=False)
+
+     def forward(self, x):
+         batch, length, d_model = x.shape
+         xz = self.in_proj(x)
+         x, z = xz.chunk(2, dim=-1)
+
+         x = x.transpose(1, 2)
+         x = self.conv1d(x)[:, :, :length]
+         x = x.transpose(1, 2)
+         x = self.act(x)
+
+         y = self.selective_scan(x)
+         y = y * self.act(z)
+         return self.out_proj(y)
+
+     def selective_scan(self, x):
+         batch, length, d_inner = x.shape
+         deltaBC = self.x_proj(x)
+         delta, B, C = torch.split(deltaBC, [1, self.d_state, self.d_state], dim=-1)
+         delta = F.softplus(self.dt_proj(delta))
+
+         deltaA = torch.exp(delta.unsqueeze(-1) * (-torch.exp(self.A_log)))
+         deltaB = delta.unsqueeze(-1) * B.unsqueeze(2)
+
+         states = torch.zeros(batch, d_inner, self.d_state, device=x.device)
+         outputs = []
+
+         for i in range(length):
+             states = deltaA[:, i] * states + deltaB[:, i] * x[:, i, :, None]
+             y = (states @ C[:, i, :, None]).squeeze(-1) + self.D * x[:, i]
+             outputs.append(y)
+
+         return torch.stack(outputs, dim=1)
+
+
+ class EnhancedGraphOrdering:
+     """Advanced graph ordering strategies"""

+     @staticmethod
+     def pagerank_ordering(edge_index, num_nodes):
+         """PageRank-based ordering preserving importance"""
+         try:
+             G = nx.Graph()
+             if edge_index.size(1) > 0:
+                 edges = edge_index.t().cpu().numpy()
+                 G.add_edges_from(edges)
+             G.add_nodes_from(range(num_nodes))
+
+             pagerank = nx.pagerank(G, max_iter=50)
+             order = sorted(range(num_nodes), key=lambda x: pagerank.get(x, 0), reverse=True)
+             return torch.tensor(order, dtype=torch.long)
+         except:
+             return torch.arange(num_nodes, dtype=torch.long)
+
+     @staticmethod
+     def community_aware_ordering(edge_index, num_nodes):
+         """Community-preserving ordering"""
+         try:
+             G = nx.Graph()
+             if edge_index.size(1) > 0:
+                 edges = edge_index.t().cpu().numpy()
+                 G.add_edges_from(edges)
+             G.add_nodes_from(range(num_nodes))
+
+             communities = nx.community.greedy_modularity_communities(G)
+             order = []
+             for community in communities:
+                 # Sort within community by degree
+                 community_list = list(community)
+                 degrees = {node: G.degree(node) for node in community_list}
+                 community_sorted = sorted(community_list, key=lambda x: degrees[x], reverse=True)
+                 order.extend(community_sorted)
+
+             return torch.tensor(order, dtype=torch.long)
+         except:
+             return torch.arange(num_nodes, dtype=torch.long)
+
+
+ class StructuralEncoding(nn.Module):
+     """Multi-faceted structural encoding"""
+     def __init__(self, d_model, max_nodes=5000, max_degree=100):
+         super().__init__()
+         self.pos_encoding = nn.Embedding(max_nodes, d_model)
+         self.degree_encoding = nn.Embedding(max_degree, d_model)
+         self.centrality_proj = nn.Linear(1, d_model)
+         self.layer_norm = nn.LayerNorm(d_model)
+
+     def forward(self, x, edge_index, node_order=None):
+         num_nodes = x.size(0)
+         device = x.device
+
+         # Position encoding
+         positions = torch.arange(num_nodes, device=device).clamp(max=self.pos_encoding.num_embeddings-1)
+         pos_emb = self.pos_encoding(positions)
+
+         # Degree encoding
+         degrees = degree(edge_index[0], num_nodes).long().clamp(max=self.degree_encoding.num_embeddings-1)
+         degree_emb = self.degree_encoding(degrees)
+
+         # Simple centrality (normalized degree)
+         centrality = degrees.float() / max(degrees.max().item(), 1.0)
+         centrality_emb = self.centrality_proj(centrality.unsqueeze(-1))
+
+         # Combine encodings
+         structural_emb = pos_emb + degree_emb + centrality_emb
+         return self.layer_norm(x + structural_emb)
+
+
+ class MultiScaleGraphMamba(nn.Module):
+     """Multi-scale processing with different orderings"""
+     def __init__(self, d_model, n_layers=3):
+         super().__init__()
+         self.d_model = d_model
+
+         # Different scale processors
+         self.local_mamba = nn.ModuleList([MambaBlock(d_model) for _ in range(n_layers//2)])
+         self.global_mamba = nn.ModuleList([MambaBlock(d_model) for _ in range(n_layers//2)])
+
+         # Fusion layers
+         self.scale_fusion = nn.Linear(d_model * 2, d_model)
+         self.layer_norm = nn.LayerNorm(d_model)
+
+     def forward(self, x, edge_index):
+         num_nodes = x.size(0)
+
+         # Different orderings
+         local_order = torch.arange(num_nodes)  # BFS equivalent
+         global_order = EnhancedGraphOrdering.pagerank_ordering(edge_index, num_nodes)
+
+         # Process local scale
+         x_local = x[local_order].unsqueeze(0)
+         for layer in self.local_mamba:
+             x_local = x_local + layer(x_local)
+         x_local = x_local.squeeze(0)
+
+         # Process global scale
+         x_global = x[global_order].unsqueeze(0)
+         for layer in self.global_mamba:
+             x_global = x_global + layer(x_global)
+         x_global = x_global.squeeze(0)
+
+         # Restore original order
+         local_restored = torch.zeros_like(x_local)
+         global_restored = torch.zeros_like(x_global)
+
+         local_restored[local_order] = x_local
+         global_restored[global_order] = x_global
+
+         # Fuse scales
+         fused = torch.cat([local_restored, global_restored], dim=-1)
+         return self.layer_norm(self.scale_fusion(fused))
+
+
+ class GraphMamba(nn.Module):
+     """Enhanced GraphMamba with accuracy improvements"""
      def __init__(self, config):
          super().__init__()

          self.config = config
+         d_model = config['model']['d_model']
+         n_layers = config['model']['n_layers']
          self.ordering_strategy = config['ordering']['strategy']

+         # Input projection
+         self.input_proj = nn.Linear(config.get('input_dim', 1433), d_model)
+
+         # Structural encoding
+         self.structural_encoding = StructuralEncoding(d_model)

+         # Multi-scale processing
+         self.multi_scale = MultiScaleGraphMamba(d_model, n_layers)

+         # Additional Mamba layers
          self.mamba_layers = nn.ModuleList([
+             MambaBlock(d_model) for _ in range(max(1, n_layers - 2))
          ])

          # Layer norms
          self.layer_norms = nn.ModuleList([
+             nn.LayerNorm(d_model) for _ in range(len(self.mamba_layers))
          ])

+         # Output projection
+         self.output_proj = nn.Linear(d_model, d_model)
+         self.dropout = nn.Dropout(config['model']['dropout'])

+         # For node classification
          self.classifier = None

+     def _get_ordering(self, edge_index, num_nodes):
+         """Get node ordering based on strategy"""
+         if self.ordering_strategy == 'pagerank':
+             return EnhancedGraphOrdering.pagerank_ordering(edge_index, num_nodes)
+         elif self.ordering_strategy == 'community':
+             return EnhancedGraphOrdering.community_aware_ordering(edge_index, num_nodes)
+         elif self.ordering_strategy == 'spectral':
+             return self._spectral_ordering(edge_index, num_nodes)
+         else:  # BFS default
+             return torch.arange(num_nodes, dtype=torch.long)

+     def _spectral_ordering(self, edge_index, num_nodes):
+         """Spectral ordering with fallback"""
+         try:
+             from torch_geometric.utils import get_laplacian
+             edge_index_lap, edge_weight = get_laplacian(edge_index, num_nodes=num_nodes)

+             # Simple degree-based approximation
+             degrees = degree(edge_index[0], num_nodes)
+             return torch.argsort(degrees, descending=True)
+         except:
+             return torch.arange(num_nodes, dtype=torch.long)
+
+     def forward(self, x, edge_index, batch=None):
+         """Enhanced forward pass"""
+         # Input projection
+         h = self.input_proj(x)

+         # Add structural information
+         h = self.structural_encoding(h, edge_index)

+         # Multi-scale processing
+         h = self.multi_scale(h, edge_index)

+         # Additional sequential processing
+         order = self._get_ordering(edge_index, h.size(0))
+         h_ordered = h[order].unsqueeze(0)

+         for mamba, ln in zip(self.mamba_layers, self.layer_norms):
              residual = h_ordered
              h_ordered = ln(h_ordered)
+             h_ordered = residual + self.dropout(mamba(h_ordered))

          # Restore original order
+         h_restored = torch.zeros_like(h_ordered.squeeze(0))
+         h_restored[order] = h_ordered.squeeze(0)

+         return self.output_proj(h_restored)
+
+     def _init_classifier(self, num_classes, device):
+         """Initialize classifier head"""
+         if self.classifier is None:
+             self.classifier = nn.Linear(self.config['model']['d_model'], num_classes).to(device)

+     def get_performance_stats(self):
+         """Get model performance statistics"""
+         total_params = sum(p.numel() for p in self.parameters())
+         return {
+             'total_params': total_params,
+             'device': next(self.parameters()).device,
+             'dtype': next(self.parameters()).dtype,
+             'ordering_strategy': self.ordering_strategy
+         }
+
+
+ class HybridGraphMamba(nn.Module):
+     """Hybrid approach with minimal GNN overhead"""
+     def __init__(self, config):
+         super().__init__()
+         from torch_geometric.nn import GCNConv

+         d_model = config['model']['d_model']
+         self.graph_mamba = GraphMamba(config)
+         self.gcn = GCNConv(d_model, d_model)
+         self.gate = nn.Linear(d_model, 1)
+         self.fusion = nn.Linear(d_model * 2, d_model)

+     def forward(self, x, edge_index, batch=None):
+         # Get both representations
+         mamba_out = self.graph_mamba(x, edge_index, batch)
+         gcn_out = self.gcn(mamba_out, edge_index)
+
+         # Learned fusion
+         gate_weight = torch.sigmoid(self.gate(mamba_out))
+         weighted = gate_weight * mamba_out + (1 - gate_weight) * gcn_out
+
+         # Final fusion
+         combined = torch.cat([mamba_out, weighted], dim=-1)
+         return self.fusion(combined)

+     def _init_classifier(self, num_classes, device):
+         """Initialize classifier for hybrid model"""
+         if not hasattr(self, 'classifier') or self.classifier is None:
+             self.classifier = nn.Linear(self.config['model']['d_model'], num_classes).to(device)

+     def get_performance_stats(self):
+         """Get hybrid model stats"""
+         return self.graph_mamba.get_performance_stats()
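As a quick sanity check of the refactored module, the sketch below is my own and is not part of the commit. It assumes the file is importable as core.graph_mamba and that the config dict carries the keys the new constructor reads (model.d_model, model.n_layers, model.dropout, ordering.strategy, plus the optional top-level input_dim); the concrete numbers (d_model=64, 10 nodes, 7 classes) are only illustrative, and calling the private _init_classifier helper directly mirrors how it is defined in the diff rather than a documented API.

# Hypothetical usage sketch for the updated GraphMamba (assumptions noted above).
import torch
from core.graph_mamba import GraphMamba  # assumed import path for this repo layout

config = {
    'model': {'d_model': 64, 'n_layers': 4, 'dropout': 0.1},
    'ordering': {'strategy': 'pagerank'},  # 'community' and 'spectral' are also handled; anything else falls back to index order
    'input_dim': 1433,                     # matches the default used by input_proj
}

model = GraphMamba(config).eval()

# Toy graph: 10 nodes with random features and 40 random edges
x = torch.randn(10, config['input_dim'])
edge_index = torch.randint(0, 10, (2, 40))

with torch.no_grad():
    h = model(x, edge_index)               # (10, d_model) node embeddings

model._init_classifier(num_classes=7, device=x.device)
logits = model.classifier(h)               # (10, 7) per-node class scores
print(h.shape, logits.shape)

HybridGraphMamba accepts the same constructor argument and forward signature, so the embedding call above works unchanged with that class swapped in.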