Spaces:

kfoughali
/

serpent

Running

App Files Files Community

kfoughali committed on Jul 29

Commit

972fdf4

verified ·

1 Parent(s): c3b0dee

Update utils/metrics.py

Browse files

Files changed (1) hide show

utils/metrics.py +382 -176

utils/metrics.py CHANGED Viewed

@@ -1,214 +1,420 @@
 import torch
 import torch.nn.functional as F
-from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
 import numpy as np
class GraphMetrics:
    """Evaluation metrics for node- and graph-level classification models.

    All methods are static. ``pred`` may be either a 2-D tensor of per-class
    scores (one row per sample) or a 1-D tensor that already holds labels;
    ``target`` holds the ground-truth class indices.
    """

    @staticmethod
    def _to_labels(pred):
        """Reduce 2-D class scores to class indices; pass 1-D input through."""
        return pred.argmax(dim=1) if pred.dim() > 1 else pred

    @staticmethod
    def _f1(pred, target, average):
        """Shared implementation of the macro/micro F1 scores (0.0 on failure)."""
        try:
            pred_labels = GraphMetrics._to_labels(pred).cpu().numpy()
            target_labels = target.cpu().numpy()
            return f1_score(target_labels, pred_labels, average=average, zero_division=0)
        except Exception:
            # Best-effort metric: keep the original "0.0 on any failure"
            # contract, but no longer swallow KeyboardInterrupt/SystemExit.
            return 0.0

    @staticmethod
    def _classify(model, embeddings, labels, device):
        """Run ``model.classifier`` on ``embeddings``, creating it first if missing.

        NOTE(review): when the classifier is missing it is created here via
        ``model._init_classifier`` with the number of distinct values in
        ``labels``. For a mini-batch that count may be smaller than the real
        number of classes - confirm against the model implementation.
        """
        if getattr(model, 'classifier', None) is None:
            num_classes = len(torch.unique(labels))
            model._init_classifier(num_classes, device)
        return model.classifier(embeddings)

    @staticmethod
    def _failure_metrics(error):
        """Metrics dictionary returned when an evaluation run raises."""
        return {
            'accuracy': 0.0,
            'f1_macro': 0.0,
            'f1_micro': 0.0,
            'loss': float('inf'),
            'error': str(error)
        }

    @staticmethod
    def accuracy(pred, target):
        """Return the fraction of correct predictions as a Python float.

        Returns 0.0 for empty input instead of NaN, matching the fallback
        value used by the other metrics.
        """
        if target.numel() == 0:
            return 0.0
        pred_labels = GraphMetrics._to_labels(pred)
        return (pred_labels == target).float().mean().item()

    @staticmethod
    def f1_score_macro(pred, target):
        """Return the macro-averaged F1 score, or 0.0 if it cannot be computed."""
        return GraphMetrics._f1(pred, target, 'macro')

    @staticmethod
    def f1_score_micro(pred, target):
        """Return the micro-averaged F1 score, or 0.0 if it cannot be computed."""
        return GraphMetrics._f1(pred, target, 'micro')

    @staticmethod
    def roc_auc(pred, target):
        """Return the ROC AUC, or 0.0 if it cannot be computed.

        2-D ``pred`` is treated as multi-class logits (softmax, one-vs-rest,
        macro average); 1-D ``pred`` is treated as binary logits (sigmoid).
        """
        try:
            if pred.dim() > 1:
                # Multi-class
                pred_probs = F.softmax(pred, dim=1).cpu().numpy()
                target_onehot = F.one_hot(target, num_classes=pred.size(1)).cpu().numpy()
                return roc_auc_score(target_onehot, pred_probs, multi_class='ovr', average='macro')
            # Binary
            pred_probs = torch.sigmoid(pred).cpu().numpy()
            target_labels = target.cpu().numpy()
            return roc_auc_score(target_labels, pred_probs)
        except Exception:
            return 0.0

    @staticmethod
    def classification_report_dict(pred, target):
        """Return the classification report as a dict, or ``{}`` on failure."""
        try:
            pred_labels = GraphMetrics._to_labels(pred).cpu().numpy()
            target_labels = target.cpu().numpy()
            return classification_report(target_labels, pred_labels, output_dict=True, zero_division=0)
        except Exception:
            return {}

    @staticmethod
    def evaluate_node_classification(model, data, mask, device, detailed=False):
        """Evaluate a node classifier on the nodes selected by ``mask``.

        Args:
            model: Callable as ``model(x, edge_index)``; its ``classifier``
                attribute maps the result to class scores.
            data: Graph object with ``x``, ``edge_index`` and ``y``.
            mask: Index or boolean mask selecting the nodes to score.
            device: Device the model and data are moved to.
            detailed: Also compute ROC AUC and the classification report.

        Returns:
            Dict with ``accuracy``, ``f1_macro``, ``f1_micro`` and ``loss``
            (plus ``roc_auc`` / ``classification_report`` when ``detailed``).
            On any failure the scores are 0.0, the loss is ``inf`` and an
            ``error`` entry holds the exception text.
        """
        model.eval()
        try:
            with torch.no_grad():
                # Ensure data and model are on the same device
                data = data.to(device)
                model = model.to(device)
                h = model(data.x, data.edge_index)
                pred = GraphMetrics._classify(model, h, data.y, device)
                pred_masked = pred[mask]
                target_masked = data.y[mask]
                metrics = {
                    'accuracy': GraphMetrics.accuracy(pred_masked, target_masked),
                    'f1_macro': GraphMetrics.f1_score_macro(pred_masked, target_masked),
                    'f1_micro': GraphMetrics.f1_score_micro(pred_masked, target_masked),
                }
                if detailed:
                    metrics['roc_auc'] = GraphMetrics.roc_auc(pred_masked, target_masked)
                    metrics['classification_report'] = GraphMetrics.classification_report_dict(
                        pred_masked, target_masked
                    )
                criterion = torch.nn.CrossEntropyLoss()
                metrics['loss'] = criterion(pred_masked, target_masked).item()
        except Exception as e:
            print(f"Evaluation error: {e}")
            metrics = GraphMetrics._failure_metrics(e)
        return metrics

    @staticmethod
    def evaluate_graph_classification(model, dataloader, device, detailed=False):
        """Evaluate a graph classifier over every batch of ``dataloader``.

        Args:
            model: Callable as ``model(x, edge_index, batch)`` and exposing
                ``get_graph_embedding(h, batch)``.
            dataloader: Iterable of batches with ``x``, ``edge_index``,
                ``batch`` and ``y``.
            device: Device each batch is moved to.
            detailed: Also compute ROC AUC and the classification report.

        Returns:
            Dict with ``accuracy``, ``f1_macro``, ``f1_micro`` and the mean
            per-batch ``loss``; ``{'error': 'No predictions generated'}`` when
            the loader yields nothing; the failure dictionary (scores 0.0,
            loss ``inf``, ``error`` text) when an exception is raised.
        """
        model.eval()
        all_preds = []
        all_targets = []
        total_loss = 0.0
        num_batches = 0
        try:
            criterion = torch.nn.CrossEntropyLoss()
            with torch.no_grad():
                for batch in dataloader:
                    batch = batch.to(device)
                    h = model(batch.x, batch.edge_index, batch.batch)
                    # Graph-level prediction
                    graph_h = model.get_graph_embedding(h, batch.batch)
                    pred = GraphMetrics._classify(model, graph_h, batch.y, device)
                    all_preds.append(pred.cpu())
                    all_targets.append(batch.y.cpu())
                    total_loss += criterion(pred, batch.y).item()
                    num_batches += 1
            if all_preds:
                all_preds = torch.cat(all_preds, dim=0)
                all_targets = torch.cat(all_targets, dim=0)
                metrics = {
                    'accuracy': GraphMetrics.accuracy(all_preds, all_targets),
                    'f1_macro': GraphMetrics.f1_score_macro(all_preds, all_targets),
                    'f1_micro': GraphMetrics.f1_score_micro(all_preds, all_targets),
                    'loss': total_loss / max(num_batches, 1)
                }
                if detailed:
                    metrics['roc_auc'] = GraphMetrics.roc_auc(all_preds, all_targets)
                    metrics['classification_report'] = GraphMetrics.classification_report_dict(
                        all_preds, all_targets
                    )
            else:
                metrics = {'error': 'No predictions generated'}
        except Exception as e:
            print(f"Graph classification evaluation error: {e}")
            metrics = GraphMetrics._failure_metrics(e)
        return metrics

    @staticmethod
    def compare_models(results_dict):
        """Summarise several metric dictionaries and name the most accurate model.

        Args:
            results_dict: Mapping of model name to its metrics dictionary.

        Returns:
            Mapping of model name to its ``accuracy``, ``f1_macro``,
            ``f1_micro`` and ``loss`` (missing values default to 0.0, or
            ``inf`` for the loss), plus a ``best_model`` entry naming the
            model with the highest accuracy. ``best_model`` is ``None`` when
            ``results_dict`` is empty.
        """
        comparison = {
            model_name: {
                'accuracy': metrics.get('accuracy', 0.0),
                'f1_macro': metrics.get('f1_macro', 0.0),
                'f1_micro': metrics.get('f1_micro', 0.0),
                'loss': metrics.get('loss', float('inf'))
            }
            for model_name, metrics in results_dict.items()
        }
        # max() raises on an empty sequence, so guard the no-results case.
        best_model = (
            max(comparison, key=lambda name: comparison[name]['accuracy'])
            if comparison else None
        )
        # NOTE: a model literally named 'best_model' would be overwritten here.
        comparison['best_model'] = best_model
        return comparison

 import torch
 import torch.nn.functional as F
+from torch_geometric.data import Data
+from torch_geometric.transforms import Compose
 import numpy as np
+import logging
+logger = logging.getLogger(__name__)
class GraphProcessor:
    """Static helpers for preparing, augmenting and sanity-checking graph data.

    ``data`` arguments are PyTorch Geometric ``Data``-like objects. Only the
    attributes ``x``, ``edge_index``, ``num_nodes`` and (optionally) ``y`` and
    ``edge_attr`` are accessed, plus ``clone()`` in ``augment_graph``.
    """

    # Lower bound applied to denominators to avoid division by zero.
    _EPS = 1e-8

    @staticmethod
    def _filter_edge_attr(data, keep_mask, num_edges):
        """Keep ``data.edge_attr`` aligned with ``edge_index`` after dropping edges."""
        edge_attr = getattr(data, 'edge_attr', None)
        if (
            isinstance(edge_attr, torch.Tensor)
            and edge_attr.dim() > 0
            and edge_attr.size(0) == num_edges
        ):
            data.edge_attr = edge_attr[keep_mask.to(edge_attr.device)]

    @staticmethod
    def normalize_features(x, method: str = 'l2'):
        """Normalize a 2-D node feature matrix.

        Args:
            x: Feature tensor of shape (num_nodes, num_features).
            method: ``'l2'`` (unit-norm rows), ``'minmax'`` (per-column scaling
                to [0, 1]) or ``'standard'`` (per-column z-score).

        Returns:
            The normalized tensor. ``x`` is returned unchanged for an unknown
            method or if the computation raises (both are logged).

        Raises:
            TypeError: If ``x`` is not a tensor.
            ValueError: If ``x`` is not 2-D.
        """
        if not isinstance(x, torch.Tensor):
            raise TypeError("x must be a torch.Tensor")
        if x.dim() != 2:
            raise ValueError("x must be a 2D tensor")
        eps = GraphProcessor._EPS
        try:
            if method == 'l2':
                norms = torch.norm(x, p=2, dim=1, keepdim=True).clamp(min=eps)
                return x / norms
            if method == 'minmax':
                x_min = x.min(dim=0, keepdim=True)[0]
                x_max = x.max(dim=0, keepdim=True)[0]
                return (x - x_min) / (x_max - x_min).clamp(min=eps)
            if method == 'standard':
                x_mean = x.mean(dim=0, keepdim=True)
                if x.size(0) > 1:
                    x_std = x.std(dim=0, keepdim=True)
                else:
                    # The sample std of a single row is NaN and clamp() keeps
                    # NaN, so treat a lone row as having zero spread instead.
                    x_std = torch.zeros_like(x_mean)
                return (x - x_mean) / x_std.clamp(min=eps)
            logger.warning("Unknown normalization method: %s, returning original", method)
            return x
        except Exception as e:
            logger.error("Feature normalization failed: %s", e)
            return x

    @staticmethod
    def add_self_loops(edge_index, num_nodes):
        """Append a self loop for every node that does not already have one.

        Args:
            edge_index: Edge connectivity tensor of shape (2, num_edges).
            num_nodes: Number of nodes in the graph.

        Returns:
            ``edge_index`` with the missing ``(i, i)`` edges appended in
            ascending node order. Existing edges (including existing self
            loops) are kept as they are. The input is returned unchanged if
            the computation raises (logged).

        Raises:
            TypeError: If ``edge_index`` is not a tensor.
            ValueError: If ``edge_index`` is not (2, num_edges) or
                ``num_nodes`` is not positive.
        """
        if not isinstance(edge_index, torch.Tensor):
            raise TypeError("edge_index must be a torch.Tensor")
        if edge_index.dim() != 2 or edge_index.size(0) != 2:
            raise ValueError("edge_index must have shape (2, num_edges)")
        if num_nodes <= 0:
            raise ValueError("num_nodes must be positive")
        try:
            src, dst = edge_index[0], edge_index[1]
            loop_nodes = src[src == dst].long()
            # Ignore out-of-range loops so they cannot break the bookkeeping.
            loop_nodes = loop_nodes[(loop_nodes >= 0) & (loop_nodes < num_nodes)]
            has_loop = torch.zeros(num_nodes, dtype=torch.bool, device=edge_index.device)
            has_loop[loop_nodes] = True
            # Previously a single existing self loop suppressed all additions;
            # now only the nodes that actually lack one receive a loop.
            missing = torch.nonzero(~has_loop).view(-1).to(edge_index.dtype)
            if missing.numel() > 0:
                new_loops = missing.unsqueeze(0).repeat(2, 1)
                edge_index = torch.cat([edge_index, new_loops], dim=1)
                logger.debug("Added %d self loops", missing.numel())
            return edge_index
        except Exception as e:
            logger.error("Adding self loops failed: %s", e)
            return edge_index

    @staticmethod
    def remove_self_loops(edge_index):
        """Drop every edge whose source equals its target.

        Args:
            edge_index: Edge connectivity tensor of shape (2, num_edges).

        Returns:
            Edge index without self loops; the input unchanged if the
            computation raises (logged).

        Raises:
            TypeError: If ``edge_index`` is not a tensor.
            ValueError: If ``edge_index`` is not (2, num_edges).
        """
        if not isinstance(edge_index, torch.Tensor):
            raise TypeError("edge_index must be a torch.Tensor")
        if edge_index.dim() != 2 or edge_index.size(0) != 2:
            raise ValueError("edge_index must have shape (2, num_edges)")
        try:
            not_loop = edge_index[0] != edge_index[1]
            filtered_edges = edge_index[:, not_loop]
            removed_count = edge_index.size(1) - filtered_edges.size(1)
            if removed_count > 0:
                logger.debug("Removed %d self loops", removed_count)
            return filtered_edges
        except Exception as e:
            logger.error("Removing self loops failed: %s", e)
            return edge_index

    @staticmethod
    def add_positional_features(data, encoding_dim: int = 8):
        """Append random-walk positional encodings to ``data.x`` in place.

        Column ``k`` of the encoding is the diagonal of the (k+1)-th power of
        the symmetrically normalized adjacency matrix. A graph without edges
        gets the leading columns of an identity matrix instead. The adjacency
        matrix is dense, so memory grows with ``num_nodes ** 2``.

        Args:
            data: Graph object with ``x``, ``edge_index`` and ``num_nodes``.
            encoding_dim: Requested encoding width; clamped to
                ``[1, num_nodes]``.

        Returns:
            The same ``data`` object. ``data.x`` is left untouched if the
            computation raises (logged).

        Raises:
            ValueError: If ``data`` lacks ``x`` or ``edge_index``.
        """
        if not hasattr(data, 'x') or not hasattr(data, 'edge_index'):
            raise ValueError("Data must have x and edge_index attributes")
        num_nodes = data.num_nodes
        encoding_dim = max(1, min(encoding_dim, num_nodes))
        try:
            edge_index = data.edge_index
            # Build everything on the graph's device; CPU-only intermediates
            # made this fail for graphs stored on an accelerator.
            device = edge_index.device
            if edge_index.size(1) > 0:
                adj = torch.zeros(num_nodes, num_nodes, dtype=torch.float, device=device)
                adj[edge_index[0], edge_index[1]] = 1.0
                # Symmetrize with max rather than sum so an edge stored in both
                # directions keeps weight 1 instead of 2.
                adj = torch.maximum(adj, adj.t())
                degree = adj.sum(dim=1).clamp(min=GraphProcessor._EPS)
                d_inv_sqrt = degree.rsqrt()
                # D^-1/2 A D^-1/2 without materializing the diagonal matrices
                a_norm = d_inv_sqrt.unsqueeze(1) * adj * d_inv_sqrt.unsqueeze(0)
                if torch.isnan(a_norm).any() or torch.isinf(a_norm).any():
                    logger.warning("Numerical issues in adjacency matrix, using simple encoding")
                    pos_encoding = torch.eye(num_nodes, device=device)[:, :encoding_dim]
                else:
                    return_probs = []
                    a_power = torch.eye(num_nodes, device=device)
                    for _ in range(encoding_dim):
                        a_power = a_power @ a_norm
                        return_probs.append(a_power.diag().unsqueeze(1))
                    pos_encoding = torch.cat(return_probs, dim=1)
            else:
                # No edges - use one-hot encoding
                pos_encoding = torch.eye(num_nodes, encoding_dim, device=device)
            x = data.x
            if x is not None:
                data.x = torch.cat([x, pos_encoding.to(x.device)], dim=1)
            else:
                data.x = pos_encoding
            logger.debug("Added positional features of dimension %d", encoding_dim)
        except Exception as e:
            # data.x is only assigned as the last step, so nothing was modified.
            logger.error("Adding positional features failed: %s", e)
        return data

    @staticmethod
    def augment_graph(data, aug_type: str = 'edge_drop', aug_ratio: float = 0.1):
        """Return a randomly augmented copy of ``data``.

        Args:
            data: Graph object providing ``clone()``, ``x``, ``edge_index``
                and ``num_nodes``.
            aug_type: ``'edge_drop'``, ``'node_drop'``, ``'feature_noise'`` or
                ``'feature_mask'``. An unknown type is logged and yields an
                unmodified copy.
            aug_ratio: Drop/mask probability, or the noise standard deviation
                for ``'feature_noise'``. Must lie in [0, 1]; 0 changes nothing
                for the drop/mask types.

        Returns:
            The augmented clone, or the original ``data`` object if the
            augmentation raises (logged). ``edge_attr`` is filtered together
            with ``edge_index`` and ``num_nodes`` is updated after a node drop.

        Raises:
            ValueError: If ``aug_ratio`` is outside [0, 1].
        """
        if not (0.0 <= aug_ratio <= 1.0):
            raise ValueError("aug_ratio must be between 0 and 1")
        # Create a copy to avoid modifying original
        aug_data = data.clone()
        try:
            if aug_type == 'edge_drop':
                edge_index = aug_data.edge_index
                num_edges = edge_index.size(1)
                if num_edges > 0:
                    # rand() is in [0, 1), so ">=" keeps everything at ratio 0
                    # and drops everything at ratio 1.
                    keep = torch.rand(num_edges, device=edge_index.device) >= aug_ratio
                    aug_data.edge_index = edge_index[:, keep]
                    GraphProcessor._filter_edge_attr(aug_data, keep, num_edges)
                    logger.debug("Dropped %d edges", int((~keep).sum()))
            elif aug_type == 'node_drop':
                num_nodes = aug_data.num_nodes
                if num_nodes > 1:
                    edge_index = aug_data.edge_index
                    device = edge_index.device
                    keep_mask = torch.rand(num_nodes, device=device) >= aug_ratio
                    # Ensure at least one node remains
                    if not keep_mask.any():
                        keep_mask[0] = True
                    keep_nodes = torch.where(keep_mask)[0]
                    x = getattr(aug_data, 'x', None)
                    if x is not None:
                        aug_data.x = x[keep_nodes.to(x.device)]
                    # Only node-level labels are filtered; a missing label
                    # tensor no longer aborts the whole augmentation.
                    y = getattr(aug_data, 'y', None)
                    if isinstance(y, torch.Tensor) and y.dim() > 0 and y.size(0) == num_nodes:
                        aug_data.y = y[keep_nodes.to(y.device)]
                    num_edges = edge_index.size(1)
                    if num_edges > 0:
                        # Map old node ids to their position among the kept nodes
                        node_map = torch.full((num_nodes,), -1, dtype=torch.long, device=device)
                        node_map[keep_nodes] = torch.arange(keep_nodes.numel(), device=device)
                        edge_mask = keep_mask[edge_index[0]] & keep_mask[edge_index[1]]
                        aug_data.edge_index = node_map[edge_index[:, edge_mask]]
                        GraphProcessor._filter_edge_attr(aug_data, edge_mask, num_edges)
                    # Keep an explicitly stored node count in sync.
                    aug_data.num_nodes = int(keep_nodes.numel())
                    logger.debug("Kept %d out of %d nodes", keep_nodes.numel(), num_nodes)
            elif aug_type == 'feature_noise':
                x = getattr(aug_data, 'x', None)
                if x is not None:
                    aug_data.x = x + torch.randn_like(x) * aug_ratio
                    logger.debug("Added noise with std %s", aug_ratio)
            elif aug_type == 'feature_mask':
                x = getattr(aug_data, 'x', None)
                if x is not None:
                    keep = torch.rand_like(x) >= aug_ratio
                    aug_data.x = x * keep
                    logger.debug("Masked %d feature values", int((~keep).sum()))
            else:
                logger.warning("Unknown augmentation type: %s", aug_type)
        except Exception as e:
            logger.error("Graph augmentation failed: %s", e)
            return data  # Return original on failure
        return aug_data

    @staticmethod
    def to_device_safe(data, device):
        """Move ``data`` to ``device``, recursing into lists, tuples and dicts.

        Args:
            data: Any object with a ``to`` method, or a (nested) list, tuple
                or dict of such objects. Other values pass through unchanged.
            device: Target device.

        Returns:
            The moved data with the same container types as the input (tuples
            stay tuples, named tuples keep their class). The input is returned
            unchanged if the transfer raises (logged).
        """
        try:
            if hasattr(data, 'to'):
                return data.to(device)
            if isinstance(data, dict):
                return {k: GraphProcessor.to_device_safe(v, device) for k, v in data.items()}
            if isinstance(data, (list, tuple)):
                moved = [GraphProcessor.to_device_safe(item, device) for item in data]
                if isinstance(data, list):
                    return moved
                # Named tuples take positional fields; plain tuples an iterable.
                return type(data)(*moved) if hasattr(data, '_fields') else tuple(moved)
            return data
        except Exception as e:
            logger.error("Device transfer failed: %s", e)
            return data

    @staticmethod
    def validate_data(data):
        """Check a graph object for structural and numerical problems.

        Args:
            data: Graph object; ``edge_index``, ``x``, ``y`` and ``num_nodes``
                are inspected when present.

        Returns:
            List of human-readable error strings (empty if no problem was
            found). A malformed ``edge_index`` no longer prevents the feature
            checks from running.
        """
        errors = []
        try:
            edge_index = getattr(data, 'edge_index', None)
            num_nodes = getattr(data, 'num_nodes', None)
            x = getattr(data, 'x', None)
            y = getattr(data, 'y', None)
            edge_index_is_tensor = isinstance(edge_index, torch.Tensor)
            edge_index_ok = False
            # Check basic structure
            if not hasattr(data, 'edge_index'):
                errors.append("Missing edge_index attribute")
            elif not edge_index_is_tensor:
                errors.append("edge_index must be a tensor")
            elif edge_index.dim() != 2 or edge_index.size(0) != 2:
                errors.append("edge_index must have shape (2, num_edges)")
            else:
                edge_index_ok = True
            # Check node features
            if x is not None:
                if not isinstance(x, torch.Tensor):
                    errors.append("Node features x must be a tensor")
                elif x.dim() != 2:
                    errors.append("Node features x must be 2D")
                elif num_nodes is not None and x.size(0) != num_nodes:
                    errors.append("Feature matrix size mismatch with num_nodes")
            # Check labels
            if y is not None and not isinstance(y, torch.Tensor):
                errors.append("Labels y must be a tensor")
            # Check edge indices bounds (only meaningful for a well-formed index)
            if edge_index_ok and edge_index.size(1) > 0:
                max_idx = edge_index.max().item()
                min_idx = edge_index.min().item()
                if min_idx < 0:
                    errors.append("Edge indices contain negative values")
                if num_nodes is not None and max_idx >= num_nodes:
                    errors.append("Edge indices exceed number of nodes")
            # Check for NaN or infinite values
            if isinstance(x, torch.Tensor):
                if torch.isnan(x).any():
                    errors.append("Node features contain NaN values")
                if torch.isinf(x).any():
                    errors.append("Node features contain infinite values")
            # Check data types
            if edge_index_is_tensor and edge_index.dtype not in (torch.long, torch.int):
                errors.append("edge_index must have integer dtype")
        except Exception as e:
            errors.append(f"Validation error: {str(e)}")
        if errors:
            logger.warning("Data validation found %d errors", len(errors))
        return errors

    @staticmethod
    def repair_data(data):
        """Fix common defects of a graph object in place.

        Repairs applied: cast a non-integer ``edge_index`` to ``long``, drop
        edges that reference nodes outside ``[0, num_nodes)`` (together with
        their ``edge_attr`` rows), and replace NaN / infinite feature values
        with zeros.

        Args:
            data: Graph object; modified in place.

        Returns:
            The same ``data`` object. Failures are logged, not raised.
        """
        try:
            edge_index = getattr(data, 'edge_index', None)
            # A missing or non-tensor edge_index is skipped so that the
            # feature repair below still runs.
            if isinstance(edge_index, torch.Tensor):
                # Fix edge index dtype
                if edge_index.dtype not in (torch.long, torch.int):
                    edge_index = edge_index.long()
                    data.edge_index = edge_index
                    logger.info("Fixed edge_index dtype")
                # Remove invalid edges
                num_nodes = getattr(data, 'num_nodes', None)
                if num_nodes is not None and edge_index.dim() == 2 and edge_index.size(0) == 2:
                    valid_mask = (
                        (edge_index[0] >= 0) & (edge_index[0] < num_nodes) &
                        (edge_index[1] >= 0) & (edge_index[1] < num_nodes)
                    )
                    if not valid_mask.all():
                        data.edge_index = edge_index[:, valid_mask]
                        GraphProcessor._filter_edge_attr(data, valid_mask, edge_index.size(1))
                        logger.info("Removed %d invalid edges", int((~valid_mask).sum()))
            # Handle NaN / infinite values in features
            x = getattr(data, 'x', None)
            if isinstance(x, torch.Tensor):
                nan_mask = torch.isnan(x)
                if nan_mask.any():
                    x = torch.where(nan_mask, torch.zeros_like(x), x)
                    data.x = x
                    logger.info("Replaced NaN values in features with zeros")
                inf_mask = torch.isinf(x)
                if inf_mask.any():
                    data.x = torch.where(inf_mask, torch.zeros_like(x), x)
                    logger.info("Replaced infinite values in features with zeros")
        except Exception as e:
            logger.error("Data repair failed: %s", e)
        return data