Spaces:

Skitzo-4152
/

vlsi

Runtime error

App Files Files Community

Skitzo-4152 committed on Sep 19

Commit

88d2960

verified ·

1 Parent(s): 00ebed8

Upload data_dataset_collector.py

Browse files

Files changed (1) hide show

data/data_dataset_collector.py +474 -0

data/data_dataset_collector.py ADDED Viewed

	@@ -0,0 +1,474 @@

+#!/usr/bin/env python3
+"""
+Dataset Collector for ChipVerifyAI
+Automated collection and preparation of chip verification datasets
+"""
+import os
+import git
+import pandas as pd
+import numpy as np
+import requests
+from pathlib import Path
+from datasets import load_dataset
+from typing import Dict, List, Any
+import zipfile
+import json
+class DatasetCollector:
+    """Collect and prepare datasets for chip verification ML/LLM training"""
+    def __init__(self, data_dir: str = "data/datasets"):
+        self.data_dir = Path(data_dir)
+        self.data_dir.mkdir(parents=True, exist_ok=True)
+        self.collected_datasets = {}
+    def collect_all_datasets(self) -> Dict[str, Any]:
+        """Collect all available datasets for training"""
+        print("📊 Starting comprehensive dataset collection...")
+        # Primary datasets from GitHub
+        self._collect_wujian100()
+        self._collect_opencores_projects()
+        self._collect_riscv_tests()
+        # HuggingFace datasets
+        self._collect_pyranet_verilog()
+        # Create synthetic datasets
+        self._create_synthetic_datasets()
+        print(f"✅ Dataset collection completed! Collected {len(self.collected_datasets)} datasets")
+        return self.collected_datasets
+    def _collect_wujian100(self):
+        """Collect Wujian100 RISC-V SoC dataset"""
+        repo_path = self.data_dir / "wujian100_open"
+        if not repo_path.exists():
+            try:
+                print("📥 Cloning Wujian100 repository...")
+                git.Repo.clone_from("https://github.com/XUANTIE-RV/wujian100_open.git", repo_path)
+                print("✅ Wujian100 dataset collected")
+            except Exception as e:
+                print(f"❌ Failed to collect Wujian100: {e}")
+                return
+        else:
+            print("✅ Wujian100 dataset already exists")
+        # Parse RTL files
+        rtl_files = list(repo_path.rglob("*.v")) + list(repo_path.rglob("*.sv"))
+        self.collected_datasets['wujian100'] = {
+            'path': str(repo_path),
+            'type': 'rtl_designs',
+            'file_count': len(rtl_files),
+            'size_mb': sum(f.stat().st_size for f in rtl_files) / (1024*1024),
+            'description': 'Complete RISC-V SoC design with verification infrastructure'
+        }
+    def _collect_opencores_projects(self):
+        """Collect OpenCores RTL projects"""
+        opencores_dir = self.data_dir / "opencores"
+        opencores_dir.mkdir(exist_ok=True)
+        projects = [
+            ("uart16550", "https://github.com/freecores/uart16550.git"),
+            ("spi", "https://github.com/freecores/spi.git"),
+            ("i2c", "https://github.com/freecores/i2c.git"),
+            ("wb_dma", "https://github.com/freecores/wb_dma.git"),
+            ("aes", "https://github.com/freecores/aes_crypto_core.git"),
+        ]
+        collected_projects = []
+        total_files = 0
+        total_size = 0
+        for project_name, repo_url in projects:
+            project_path = opencores_dir / project_name
+            if not project_path.exists():
+                try:
+                    print(f"📥 Cloning {project_name}...")
+                    git.Repo.clone_from(repo_url, project_path)
+                    print(f"✅ {project_name} collected")
+                except Exception as e:
+                    print(f"❌ Failed to collect {project_name}: {e}")
+                    continue
+            # Count files and size
+            rtl_files = list(project_path.rglob("*.v")) + list(project_path.rglob("*.sv"))
+            if rtl_files:
+                file_count = len(rtl_files)
+                size_mb = sum(f.stat().st_size for f in rtl_files) / (1024*1024)
+                collected_projects.append({
+                    'name': project_name,
+                    'files': file_count,
+                    'size_mb': size_mb
+                })
+                total_files += file_count
+                total_size += size_mb
+        self.collected_datasets['opencores'] = {
+            'path': str(opencores_dir),
+            'type': 'rtl_designs',
+            'projects': collected_projects,
+            'total_files': total_files,
+            'total_size_mb': total_size,
+            'description': 'OpenCores RTL IP projects'
+        }
+    def _collect_riscv_tests(self):
+        """Collect RISC-V architecture test suite"""
+        riscv_path = self.data_dir / "riscv_arch_test"
+        if not riscv_path.exists():
+            try:
+                print("📥 Cloning RISC-V architecture tests...")
+                git.Repo.clone_from("https://github.com/riscv-non-isa/riscv-arch-test.git", riscv_path)
+                print("✅ RISC-V tests collected")
+            except Exception as e:
+                print(f"❌ Failed to collect RISC-V tests: {e}")
+                return
+        # Count test files
+        test_files = list(riscv_path.rglob("*.S")) + list(riscv_path.rglob("*.s"))
+        self.collected_datasets['riscv_tests'] = {
+            'path': str(riscv_path),
+            'type': 'test_suite',
+            'file_count': len(test_files),
+            'size_mb': sum(f.stat().st_size for f in test_files) / (1024*1024),
+            'description': 'RISC-V architecture compliance tests'
+        }
+    def _collect_pyranet_verilog(self):
+        """Collect PyraNet Verilog dataset from HuggingFace"""
+        try:
+            print("📥 Loading PyraNet-Verilog from HuggingFace...")
+            dataset = load_dataset("bnadimi/PyraNet-Verilog")
+            pyranet_dir = self.data_dir / "pyranet_verilog"
+            pyranet_dir.mkdir(exist_ok=True)
+            total_samples = 0
+            for split_name, split_data in dataset.items():
+                df = pd.DataFrame(split_data)
+                csv_path = pyranet_dir / f"{split_name}.csv"
+                df.to_csv(csv_path, index=False)
+                total_samples += len(df)
+            self.collected_datasets['pyranet_verilog'] = {
+                'path': str(pyranet_dir),
+                'type': 'verilog_dataset',
+                'total_samples': total_samples,
+                'splits': list(dataset.keys()),
+                'description': 'Large-scale Verilog code dataset with descriptions'
+            }
+            print("✅ PyraNet-Verilog dataset collected")
+        except Exception as e:
+            print(f"❌ Failed to collect PyraNet-Verilog: {e}")
+    def _create_synthetic_datasets(self):
+        """Create synthetic datasets for training"""
+        print("🎯 Creating synthetic datasets...")
+        # 1. Verification logs
+        logs_df = self._create_verification_logs(n_samples=10000)
+        logs_path = self.data_dir / "synthetic_verification_logs.csv"
+        logs_df.to_csv(logs_path, index=False)
+        # 2. RTL design features
+        rtl_df = self._create_rtl_features(n_samples=5000)
+        rtl_path = self.data_dir / "synthetic_rtl_features.csv"
+        rtl_df.to_csv(rtl_path, index=False)
+        # 3. Bug reports
+        bugs_df = self._create_bug_reports(n_samples=3000)
+        bugs_path = self.data_dir / "synthetic_bug_reports.csv"
+        bugs_df.to_csv(bugs_path, index=False)
+        # 4. LLM training examples
+        llm_examples = self._create_llm_training_examples()
+        llm_path = self.data_dir / "llm_training_examples.json"
+        with open(llm_path, 'w') as f:
+            json.dump(llm_examples, f, indent=2)
+        self.collected_datasets['synthetic_data'] = {
+            'verification_logs': {
+                'path': str(logs_path),
+                'samples': len(logs_df),
+                'description': 'Synthetic verification simulation logs'
+            },
+            'rtl_features': {
+                'path': str(rtl_path),
+                'samples': len(rtl_df),
+                'description': 'Synthetic RTL design characteristics'
+            },
+            'bug_reports': {
+                'path': str(bugs_path),
+                'samples': len(bugs_df),
+                'description': 'Synthetic bug report data'
+            },
+            'llm_examples': {
+                'path': str(llm_path),
+                'samples': len(llm_examples),
+                'description': 'Training examples for LLM fine-tuning'
+            }
+        }
+        print("✅ Synthetic datasets created")
+    def _create_verification_logs(self, n_samples: int) -> pd.DataFrame:
+        """Create realistic verification log dataset"""
+        np.random.seed(42)
+        modules = ['ALU', 'UART', 'SPI', 'I2C', 'WDT', 'CPU_CORE', 'MEMORY_CTRL', 'DMA', 'GPIO', 'TIMER', 'PLL', 'ADC']
+        test_types = ['functional', 'timing', 'power', 'coverage', 'stress', 'protocol', 'random', 'directed']
+        result_types = ['PASS', 'FAIL_TIMING', 'FAIL_PROTOCOL', 'FAIL_ASSERTION', 'FAIL_COVERAGE', 'ERROR', 'TIMEOUT']
+        simulators = ['ModelSim', 'VCS', 'Xcelium', 'Verilator', 'Icarus']
+        data = []
+        for i in range(n_samples):
+            module = np.random.choice(modules)
+            test_type = np.random.choice(test_types)
+            result = np.random.choice(result_types, p=[0.70, 0.08, 0.06, 0.06, 0.05, 0.03, 0.02])
+            # Module complexity affects metrics
+            complexity_map = {
+                'ALU': 1, 'UART': 2, 'SPI': 2, 'I2C': 3, 'WDT': 1,
+                'CPU_CORE': 10, 'MEMORY_CTRL': 6, 'DMA': 5, 'GPIO': 1,
+                'TIMER': 2, 'PLL': 4, 'ADC': 3
+            }
+            complexity = complexity_map.get(module, 2)
+            entry = {
+                'log_id': f"sim_{i:06d}",
+                'timestamp': pd.Timestamp('2024-01-01') + pd.Timedelta(hours=i//50),
+                'module': module,
+                'test_type': test_type,
+                'result': result,
+                'simulator': np.random.choice(simulators),
+                'duration_seconds': max(1, np.random.exponential(60 * complexity)),
+                'lines_of_log': max(10, np.random.poisson(300 * complexity)),
+                'error_count': np.random.poisson(2) if result != 'PASS' else 0,
+                'warning_count': max(0, np.random.poisson(4)),
+                'coverage_line': max(0, min(100, np.random.normal(85 if result == 'PASS' else 65, 15))),
+                'coverage_branch': max(0, min(100, np.random.normal(80 if result == 'PASS' else 60, 18))),
+                'coverage_toggle': max(0, min(100, np.random.normal(75 if result == 'PASS' else 55, 20))),
+                'memory_usage_mb': max(1, np.random.exponential(50 * complexity)),
+                'cpu_usage_percent': max(1, min(100, np.random.normal(40, 25))),
+                'seed_value': np.random.randint(1, 1000000),
+                'complexity_score': complexity + np.random.normal(0, 0.5),
+                'test_case_count': max(1, np.random.poisson(20 * complexity)),
+                'assertion_count': max(0, np.random.poisson(10 * complexity)),
+                'clock_cycles': max(100, np.random.exponential(10000 * complexity))
+            }
+            # Derived features
+            entry['is_failure'] = result != 'PASS'
+            entry['severity'] = 'HIGH' if 'ERROR' in result or 'TIMEOUT' in result else 'MEDIUM' if 'FAIL' in result else 'LOW'
+            entry['efficiency_score'] = entry['coverage_line'] / max(1, entry['duration_seconds'] / 60)
+            data.append(entry)
+        return pd.DataFrame(data)
+    def _create_rtl_features(self, n_samples: int) -> pd.DataFrame:
+        """Create RTL design features dataset"""
+        np.random.seed(42)
+        design_types = ['ALU', 'CPU', 'GPU', 'DSP', 'MEMORY', 'IO', 'NETWORK', 'CRYPTO', 'FPGA', 'ASIC', 'SOC']
+        data = []
+        for i in range(n_samples):
+            design_type = np.random.choice(design_types)
+            # Base complexity varies by type
+            complexity_map = {
+                'ALU': 2, 'CPU': 8, 'GPU': 10, 'DSP': 6, 'MEMORY': 4,
+                'IO': 3, 'NETWORK': 7, 'CRYPTO': 5, 'FPGA': 6, 'ASIC': 9, 'SOC': 12
+            }
+            base_complexity = complexity_map[design_type]
+            features = {
+                'design_id': f"design_{i:05d}",
+                'design_type': design_type,
+                'lines_of_code': int(np.random.lognormal(np.log(1000 * base_complexity), 0.8)),
+                'module_count': max(1, np.random.poisson(5 * base_complexity)),
+                'signal_count': max(10, np.random.poisson(50 * base_complexity)),
+                'always_blocks': max(1, np.random.poisson(10 * base_complexity)),
+                'assign_statements': max(1, np.random.poisson(20 * base_complexity)),
+                'if_statements': max(1, np.random.poisson(15 * base_complexity)),
+                'case_statements': max(0, np.random.poisson(5 * base_complexity)),
+                'for_loops': max(0, np.random.poisson(3 * base_complexity)),
+                'function_count': max(0, np.random.poisson(2 * base_complexity)),
+                'task_count': max(0, np.random.poisson(3 * base_complexity)),
+                'clock_domains': max(1, np.random.poisson(2) + 1),
+                'reset_signals': max(1, np.random.poisson(1) + 1),
+                'interface_signals': max(5, np.random.poisson(20 * base_complexity)),
+                'memory_instances': np.random.poisson(2) if design_type in ['CPU', 'GPU', 'MEMORY', 'SOC'] else max(0, np.random.poisson(0.5)),
+                'fsm_count': np.random.poisson(3) if design_type in ['CPU', 'IO', 'NETWORK'] else max(0, np.random.poisson(1)),
+                'pipeline_stages': np.random.poisson(4) if design_type in ['CPU', 'DSP'] else max(0, np.random.poisson(1)),
+                'arithmetic_units': np.random.poisson(5) if design_type in ['ALU', 'DSP', 'CRYPTO'] else max(0, np.random.poisson(1)),
+                'complexity_score': max(1.0, base_complexity + np.random.normal(0, 1.5)),
+                'target_frequency_mhz': max(10, np.random.normal(200, 100)),
+                'power_budget_mw': max(1, np.random.exponential(100 * base_complexity)),
+                'area_budget_um2': max(1000, np.random.exponential(10000 * base_complexity)),
+                'technology_node_nm': np.random.choice([180, 130, 90, 65, 45, 32, 28, 22, 16, 14, 10, 7, 5]),
+                'verification_time_hours': max(1, np.random.exponential(40 * base_complexity))
+            }
+            # Boolean features
+            features['has_memory'] = features['memory_instances'] > 0
+            features['has_fsm'] = features['fsm_count'] > 0
+            features['has_pipeline'] = features['pipeline_stages'] > 2
+            features['has_floating_point'] = design_type in ['DSP', 'GPU'] or np.random.random() < 0.1
+            features['is_complex'] = features['complexity_score'] > 6
+            features['is_large'] = features['lines_of_code'] > 10000
+            features['is_high_freq'] = features['target_frequency_mhz'] > 500
+            features['is_low_power'] = features['power_budget_mw'] < 50
+            # Bug probability based on realistic factors
+            complexity_factor = min(0.4, features['complexity_score'] / 20)
+            size_factor = min(0.3, features['lines_of_code'] / 50000)
+            frequency_factor = 0.1 if features['target_frequency_mhz'] > 1000 else 0
+            tech_factor = 0.1 if features['technology_node_nm'] <= 16 else 0
+            bug_probability = complexity_factor + size_factor + frequency_factor + tech_factor + np.random.normal(0, 0.1)
+            bug_probability = max(0.05, min(0.90, bug_probability))
+            features['bug_probability'] = bug_probability
+            features['has_bugs'] = np.random.random() < bug_probability
+            features['estimated_bug_count'] = np.random.poisson(5) if features['has_bugs'] else 0
+            data.append(features)
+        return pd.DataFrame(data)
+    def _create_bug_reports(self, n_samples: int) -> pd.DataFrame:
+        """Create bug reports dataset"""
+        np.random.seed(42)
+        bug_types = ['functional', 'timing', 'power', 'synthesis', 'verification', 'interface', 'protocol']
+        severities = ['critical', 'major', 'minor', 'trivial']
+        statuses = ['open', 'in_progress', 'fixed', 'closed', 'wont_fix']
+        data = []
+        for i in range(n_samples):
+            bug_type = np.random.choice(bug_types)
+            severity = np.random.choice(severities, p=[0.10, 0.30, 0.45, 0.15])
+            status = np.random.choice(statuses, p=[0.15, 0.25, 0.35, 0.20, 0.05])
+            report = {
+                'bug_id': f"BUG_{i:06d}",
+                'report_date': pd.Timestamp('2023-01-01') + pd.Timedelta(days=np.random.randint(0, 400)),
+                'bug_type': bug_type,
+                'severity': severity,
+                'status': status,
+                'affected_module': np.random.choice(['ALU', 'UART', 'SPI', 'CPU_CORE', 'MEMORY', 'IO', 'NETWORK']),
+                'reporter_experience': np.random.choice(['junior', 'mid', 'senior', 'expert'], p=[0.25, 0.35, 0.30, 0.10]),
+                'lines_affected': max(1, np.random.poisson(15)),
+                'files_affected': max(1, np.random.poisson(3)),
+                'discovery_method': np.random.choice(['simulation', 'synthesis', 'formal', 'review', 'customer']),
+                'fix_complexity': np.random.randint(1, 6),
+                'regression_risk': np.random.choice(['low', 'medium', 'high'], p=[0.60, 0.30, 0.10]),
+                'test_coverage_before': max(0, min(100, np.random.normal(70, 20))),
+                'test_coverage_after': max(0, min(100, np.random.normal(85, 15))),
+                'verification_effort_hours': max(0.5, np.random.exponential(8)),
+                'fix_effort_hours': max(0.5, np.random.exponential(12)),
+                'related_bugs': max(0, np.random.poisson(1)),
+                'priority': np.random.choice(['low', 'medium', 'high', 'urgent'], p=[0.30, 0.40, 0.25, 0.05])
+            }
+            # Add resolution date if fixed/closed
+            if status in ['fixed', 'closed']:
+                days_to_fix = max(1, np.random.exponential(30))
+                report['resolution_date'] = report['report_date'] + pd.Timedelta(days=days_to_fix)
+            else:
+                report['resolution_date'] = None
+            data.append(report)
+        return pd.DataFrame(data)
+    def _create_llm_training_examples(self) -> List[Dict[str, str]]:
+        """Create training examples for LLM fine-tuning"""
+        examples = []
+        # Basic testbench examples
+        modules = ['FIFO', 'Counter', 'Multiplexer', 'Decoder', 'Encoder', 'Shifter']
+        for module in modules:
+            examples.append({
+                'instruction': f'Generate basic testbench for {module} module',
+                'response': f'''module {module.lower()}_testbench;
+    logic clk = 0;
+    logic reset_n = 1;
+    // {module} signals
+    logic enable;
+    logic [7:0] data_in, data_out;
+    always #5 clk = ~clk;
+    initial begin
+        reset_n = 0; #20 reset_n = 1;
+        @(posedge clk);
+        // Test {module} functionality
+        enable = 1;
+        data_in = 8'hAA;
+        repeat(10) @(posedge clk);
+        $display("Test completed");
+        $finish;
+    end
+endmodule'''
+            })
+        # Protocol-specific examples
+        protocols = ['UART', 'SPI', 'I2C']
+        for protocol in protocols:
+            examples.append({
+                'instruction': f'Generate {protocol} protocol assertions',
+                'response': f'''module {protocol.lower()}_assertions(
+    input clk, reset_n,
+    input start, done, valid
+);
+    property {protocol.lower()}_handshake;
+        @(posedge clk) disable iff (!reset_n)
+        start |-> ##[1:100] done;
+    endproperty
+    assert property({protocol.lower()}_handshake)
+        else $error("{protocol} handshake failed");
+endmodule'''
+            })
+        return examples
+    def create_synthetic_dataset(self, n_samples: int = 1000) -> pd.DataFrame:
+        """Create a quick synthetic dataset for demo purposes"""
+        return self._create_rtl_features(n_samples)
+    def get_dataset_summary(self) -> Dict[str, Any]:
+        """Get summary of all collected datasets"""
+        summary = {
+            'timestamp': pd.Timestamp.now().isoformat(),
+            'total_datasets': len(self.collected_datasets),
+            'datasets': self.collected_datasets
+        }
+        # Save summary
+        summary_path = self.data_dir / "dataset_summary.json"
+        with open(summary_path, 'w') as f:
+            json.dump(summary, f, indent=2, default=str)
+        return summary