Skitzo-4152 committed
Commit 88d2960 · verified · 1 Parent(s): 00ebed8

Upload data_dataset_collector.py

Files changed (1)
  1. data/data_dataset_collector.py +474 -0
data/data_dataset_collector.py ADDED
@@ -0,0 +1,474 @@
#!/usr/bin/env python3
"""
Dataset Collector for ChipVerifyAI
Automated collection and preparation of chip verification datasets
"""

import os
import git
import pandas as pd
import numpy as np
import requests
from pathlib import Path
from datasets import load_dataset
from typing import Dict, List, Any
import zipfile
import json

class DatasetCollector:
    """Collect and prepare datasets for chip verification ML/LLM training"""

    def __init__(self, data_dir: str = "data/datasets"):
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.collected_datasets = {}

    def collect_all_datasets(self) -> Dict[str, Any]:
        """Collect all available datasets for training"""
        print("📊 Starting comprehensive dataset collection...")

        # Primary datasets from GitHub
        self._collect_wujian100()
        self._collect_opencores_projects()
        self._collect_riscv_tests()

        # HuggingFace datasets
        self._collect_pyranet_verilog()

        # Create synthetic datasets
        self._create_synthetic_datasets()

        print(f"✅ Dataset collection completed! Collected {len(self.collected_datasets)} datasets")
        return self.collected_datasets

    def _collect_wujian100(self):
        """Collect Wujian100 RISC-V SoC dataset"""
        repo_path = self.data_dir / "wujian100_open"

        if not repo_path.exists():
            try:
                print("📥 Cloning Wujian100 repository...")
                git.Repo.clone_from("https://github.com/XUANTIE-RV/wujian100_open.git", repo_path)
                print("✅ Wujian100 dataset collected")
            except Exception as e:
                print(f"❌ Failed to collect Wujian100: {e}")
                return
        else:
            print("✅ Wujian100 dataset already exists")

        # Parse RTL files
        rtl_files = list(repo_path.rglob("*.v")) + list(repo_path.rglob("*.sv"))

        self.collected_datasets['wujian100'] = {
            'path': str(repo_path),
            'type': 'rtl_designs',
            'file_count': len(rtl_files),
            'size_mb': sum(f.stat().st_size for f in rtl_files) / (1024*1024),
            'description': 'Complete RISC-V SoC design with verification infrastructure'
        }

    def _collect_opencores_projects(self):
        """Collect OpenCores RTL projects"""
        opencores_dir = self.data_dir / "opencores"
        opencores_dir.mkdir(exist_ok=True)

        projects = [
            ("uart16550", "https://github.com/freecores/uart16550.git"),
            ("spi", "https://github.com/freecores/spi.git"),
            ("i2c", "https://github.com/freecores/i2c.git"),
            ("wb_dma", "https://github.com/freecores/wb_dma.git"),
            ("aes", "https://github.com/freecores/aes_crypto_core.git"),
        ]

        collected_projects = []
        total_files = 0
        total_size = 0

        for project_name, repo_url in projects:
            project_path = opencores_dir / project_name

            if not project_path.exists():
                try:
                    print(f"📥 Cloning {project_name}...")
                    git.Repo.clone_from(repo_url, project_path)
                    print(f"✅ {project_name} collected")
                except Exception as e:
                    print(f"❌ Failed to collect {project_name}: {e}")
                    continue

            # Count files and size
            rtl_files = list(project_path.rglob("*.v")) + list(project_path.rglob("*.sv"))
            if rtl_files:
                file_count = len(rtl_files)
                size_mb = sum(f.stat().st_size for f in rtl_files) / (1024*1024)

                collected_projects.append({
                    'name': project_name,
                    'files': file_count,
                    'size_mb': size_mb
                })

                total_files += file_count
                total_size += size_mb

        self.collected_datasets['opencores'] = {
            'path': str(opencores_dir),
            'type': 'rtl_designs',
            'projects': collected_projects,
            'total_files': total_files,
            'total_size_mb': total_size,
            'description': 'OpenCores RTL IP projects'
        }

    def _collect_riscv_tests(self):
        """Collect RISC-V architecture test suite"""
        riscv_path = self.data_dir / "riscv_arch_test"

        if not riscv_path.exists():
            try:
                print("📥 Cloning RISC-V architecture tests...")
                git.Repo.clone_from("https://github.com/riscv-non-isa/riscv-arch-test.git", riscv_path)
                print("✅ RISC-V tests collected")
            except Exception as e:
                print(f"❌ Failed to collect RISC-V tests: {e}")
                return

        # Count test files
        test_files = list(riscv_path.rglob("*.S")) + list(riscv_path.rglob("*.s"))

        self.collected_datasets['riscv_tests'] = {
            'path': str(riscv_path),
            'type': 'test_suite',
            'file_count': len(test_files),
            'size_mb': sum(f.stat().st_size for f in test_files) / (1024*1024),
            'description': 'RISC-V architecture compliance tests'
        }

    def _collect_pyranet_verilog(self):
        """Collect PyraNet Verilog dataset from HuggingFace"""
        try:
            print("📥 Loading PyraNet-Verilog from HuggingFace...")
            dataset = load_dataset("bnadimi/PyraNet-Verilog")

            pyranet_dir = self.data_dir / "pyranet_verilog"
            pyranet_dir.mkdir(exist_ok=True)

            total_samples = 0
            for split_name, split_data in dataset.items():
                df = pd.DataFrame(split_data)
                csv_path = pyranet_dir / f"{split_name}.csv"
                df.to_csv(csv_path, index=False)
                total_samples += len(df)

            self.collected_datasets['pyranet_verilog'] = {
                'path': str(pyranet_dir),
                'type': 'verilog_dataset',
                'total_samples': total_samples,
                'splits': list(dataset.keys()),
                'description': 'Large-scale Verilog code dataset with descriptions'
            }

            print("✅ PyraNet-Verilog dataset collected")

        except Exception as e:
            print(f"❌ Failed to collect PyraNet-Verilog: {e}")

    def _create_synthetic_datasets(self):
        """Create synthetic datasets for training"""
        print("🎯 Creating synthetic datasets...")

        # 1. Verification logs
        logs_df = self._create_verification_logs(n_samples=10000)
        logs_path = self.data_dir / "synthetic_verification_logs.csv"
        logs_df.to_csv(logs_path, index=False)

        # 2. RTL design features
        rtl_df = self._create_rtl_features(n_samples=5000)
        rtl_path = self.data_dir / "synthetic_rtl_features.csv"
        rtl_df.to_csv(rtl_path, index=False)

        # 3. Bug reports
        bugs_df = self._create_bug_reports(n_samples=3000)
        bugs_path = self.data_dir / "synthetic_bug_reports.csv"
        bugs_df.to_csv(bugs_path, index=False)

        # 4. LLM training examples
        llm_examples = self._create_llm_training_examples()
        llm_path = self.data_dir / "llm_training_examples.json"
        with open(llm_path, 'w') as f:
            json.dump(llm_examples, f, indent=2)

        self.collected_datasets['synthetic_data'] = {
            'verification_logs': {
                'path': str(logs_path),
                'samples': len(logs_df),
                'description': 'Synthetic verification simulation logs'
            },
            'rtl_features': {
                'path': str(rtl_path),
                'samples': len(rtl_df),
                'description': 'Synthetic RTL design characteristics'
            },
            'bug_reports': {
                'path': str(bugs_path),
                'samples': len(bugs_df),
                'description': 'Synthetic bug report data'
            },
            'llm_examples': {
                'path': str(llm_path),
                'samples': len(llm_examples),
                'description': 'Training examples for LLM fine-tuning'
            }
        }

        print("✅ Synthetic datasets created")

    def _create_verification_logs(self, n_samples: int) -> pd.DataFrame:
        """Create realistic verification log dataset"""
        np.random.seed(42)

        modules = ['ALU', 'UART', 'SPI', 'I2C', 'WDT', 'CPU_CORE', 'MEMORY_CTRL', 'DMA', 'GPIO', 'TIMER', 'PLL', 'ADC']
        test_types = ['functional', 'timing', 'power', 'coverage', 'stress', 'protocol', 'random', 'directed']
        result_types = ['PASS', 'FAIL_TIMING', 'FAIL_PROTOCOL', 'FAIL_ASSERTION', 'FAIL_COVERAGE', 'ERROR', 'TIMEOUT']
        simulators = ['ModelSim', 'VCS', 'Xcelium', 'Verilator', 'Icarus']

        data = []
        for i in range(n_samples):
            module = np.random.choice(modules)
            test_type = np.random.choice(test_types)
            result = np.random.choice(result_types, p=[0.70, 0.08, 0.06, 0.06, 0.05, 0.03, 0.02])

            # Module complexity affects metrics
            complexity_map = {
                'ALU': 1, 'UART': 2, 'SPI': 2, 'I2C': 3, 'WDT': 1,
                'CPU_CORE': 10, 'MEMORY_CTRL': 6, 'DMA': 5, 'GPIO': 1,
                'TIMER': 2, 'PLL': 4, 'ADC': 3
            }
            complexity = complexity_map.get(module, 2)

            entry = {
                'log_id': f"sim_{i:06d}",
                'timestamp': pd.Timestamp('2024-01-01') + pd.Timedelta(hours=i//50),
                'module': module,
                'test_type': test_type,
                'result': result,
                'simulator': np.random.choice(simulators),
                'duration_seconds': max(1, np.random.exponential(60 * complexity)),
                'lines_of_log': max(10, np.random.poisson(300 * complexity)),
                'error_count': np.random.poisson(2) if result != 'PASS' else 0,
                'warning_count': max(0, np.random.poisson(4)),
                'coverage_line': max(0, min(100, np.random.normal(85 if result == 'PASS' else 65, 15))),
                'coverage_branch': max(0, min(100, np.random.normal(80 if result == 'PASS' else 60, 18))),
                'coverage_toggle': max(0, min(100, np.random.normal(75 if result == 'PASS' else 55, 20))),
                'memory_usage_mb': max(1, np.random.exponential(50 * complexity)),
                'cpu_usage_percent': max(1, min(100, np.random.normal(40, 25))),
                'seed_value': np.random.randint(1, 1000000),
                'complexity_score': complexity + np.random.normal(0, 0.5),
                'test_case_count': max(1, np.random.poisson(20 * complexity)),
                'assertion_count': max(0, np.random.poisson(10 * complexity)),
                'clock_cycles': max(100, np.random.exponential(10000 * complexity))
            }

            # Derived features
            entry['is_failure'] = result != 'PASS'
            entry['severity'] = 'HIGH' if 'ERROR' in result or 'TIMEOUT' in result else 'MEDIUM' if 'FAIL' in result else 'LOW'
            entry['efficiency_score'] = entry['coverage_line'] / max(1, entry['duration_seconds'] / 60)

            data.append(entry)

        return pd.DataFrame(data)

    def _create_rtl_features(self, n_samples: int) -> pd.DataFrame:
        """Create RTL design features dataset"""
        np.random.seed(42)

        design_types = ['ALU', 'CPU', 'GPU', 'DSP', 'MEMORY', 'IO', 'NETWORK', 'CRYPTO', 'FPGA', 'ASIC', 'SOC']

        data = []
        for i in range(n_samples):
            design_type = np.random.choice(design_types)

            # Base complexity varies by type
            complexity_map = {
                'ALU': 2, 'CPU': 8, 'GPU': 10, 'DSP': 6, 'MEMORY': 4,
                'IO': 3, 'NETWORK': 7, 'CRYPTO': 5, 'FPGA': 6, 'ASIC': 9, 'SOC': 12
            }
            base_complexity = complexity_map[design_type]

            features = {
                'design_id': f"design_{i:05d}",
                'design_type': design_type,
                'lines_of_code': int(np.random.lognormal(np.log(1000 * base_complexity), 0.8)),
                'module_count': max(1, np.random.poisson(5 * base_complexity)),
                'signal_count': max(10, np.random.poisson(50 * base_complexity)),
                'always_blocks': max(1, np.random.poisson(10 * base_complexity)),
                'assign_statements': max(1, np.random.poisson(20 * base_complexity)),
                'if_statements': max(1, np.random.poisson(15 * base_complexity)),
                'case_statements': max(0, np.random.poisson(5 * base_complexity)),
                'for_loops': max(0, np.random.poisson(3 * base_complexity)),
                'function_count': max(0, np.random.poisson(2 * base_complexity)),
                'task_count': max(0, np.random.poisson(3 * base_complexity)),
                'clock_domains': max(1, np.random.poisson(2) + 1),
                'reset_signals': max(1, np.random.poisson(1) + 1),
                'interface_signals': max(5, np.random.poisson(20 * base_complexity)),
                'memory_instances': np.random.poisson(2) if design_type in ['CPU', 'GPU', 'MEMORY', 'SOC'] else max(0, np.random.poisson(0.5)),
                'fsm_count': np.random.poisson(3) if design_type in ['CPU', 'IO', 'NETWORK'] else max(0, np.random.poisson(1)),
                'pipeline_stages': np.random.poisson(4) if design_type in ['CPU', 'DSP'] else max(0, np.random.poisson(1)),
                'arithmetic_units': np.random.poisson(5) if design_type in ['ALU', 'DSP', 'CRYPTO'] else max(0, np.random.poisson(1)),
                'complexity_score': max(1.0, base_complexity + np.random.normal(0, 1.5)),
                'target_frequency_mhz': max(10, np.random.normal(200, 100)),
                'power_budget_mw': max(1, np.random.exponential(100 * base_complexity)),
                'area_budget_um2': max(1000, np.random.exponential(10000 * base_complexity)),
                'technology_node_nm': np.random.choice([180, 130, 90, 65, 45, 32, 28, 22, 16, 14, 10, 7, 5]),
                'verification_time_hours': max(1, np.random.exponential(40 * base_complexity))
            }

            # Boolean features
            features['has_memory'] = features['memory_instances'] > 0
            features['has_fsm'] = features['fsm_count'] > 0
            features['has_pipeline'] = features['pipeline_stages'] > 2
            features['has_floating_point'] = design_type in ['DSP', 'GPU'] or np.random.random() < 0.1
            features['is_complex'] = features['complexity_score'] > 6
            features['is_large'] = features['lines_of_code'] > 10000
            features['is_high_freq'] = features['target_frequency_mhz'] > 500
            features['is_low_power'] = features['power_budget_mw'] < 50

            # Bug probability based on realistic factors
            complexity_factor = min(0.4, features['complexity_score'] / 20)
            size_factor = min(0.3, features['lines_of_code'] / 50000)
            frequency_factor = 0.1 if features['target_frequency_mhz'] > 1000 else 0
            tech_factor = 0.1 if features['technology_node_nm'] <= 16 else 0

            bug_probability = complexity_factor + size_factor + frequency_factor + tech_factor + np.random.normal(0, 0.1)
            bug_probability = max(0.05, min(0.90, bug_probability))
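            # Worked example of the heuristic above (illustrative figures, not
            # from data): a CPU-class design with complexity_score 8, 20,000
            # lines of code, a 1.2 GHz target and a 7 nm node contributes
            # min(0.4, 8/20) + min(0.3, 20000/50000) + 0.1 + 0.1 = 0.9 before
            # the noise term, which the clamp then caps at 0.90.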

            features['bug_probability'] = bug_probability
            features['has_bugs'] = np.random.random() < bug_probability
            features['estimated_bug_count'] = np.random.poisson(5) if features['has_bugs'] else 0

            data.append(features)

        return pd.DataFrame(data)

    def _create_bug_reports(self, n_samples: int) -> pd.DataFrame:
        """Create bug reports dataset"""
        np.random.seed(42)

        bug_types = ['functional', 'timing', 'power', 'synthesis', 'verification', 'interface', 'protocol']
        severities = ['critical', 'major', 'minor', 'trivial']
        statuses = ['open', 'in_progress', 'fixed', 'closed', 'wont_fix']

        data = []
        for i in range(n_samples):
            bug_type = np.random.choice(bug_types)
            severity = np.random.choice(severities, p=[0.10, 0.30, 0.45, 0.15])
            status = np.random.choice(statuses, p=[0.15, 0.25, 0.35, 0.20, 0.05])

            report = {
                'bug_id': f"BUG_{i:06d}",
                'report_date': pd.Timestamp('2023-01-01') + pd.Timedelta(days=np.random.randint(0, 400)),
                'bug_type': bug_type,
                'severity': severity,
                'status': status,
                'affected_module': np.random.choice(['ALU', 'UART', 'SPI', 'CPU_CORE', 'MEMORY', 'IO', 'NETWORK']),
                'reporter_experience': np.random.choice(['junior', 'mid', 'senior', 'expert'], p=[0.25, 0.35, 0.30, 0.10]),
                'lines_affected': max(1, np.random.poisson(15)),
                'files_affected': max(1, np.random.poisson(3)),
                'discovery_method': np.random.choice(['simulation', 'synthesis', 'formal', 'review', 'customer']),
                'fix_complexity': np.random.randint(1, 6),
                'regression_risk': np.random.choice(['low', 'medium', 'high'], p=[0.60, 0.30, 0.10]),
                'test_coverage_before': max(0, min(100, np.random.normal(70, 20))),
                'test_coverage_after': max(0, min(100, np.random.normal(85, 15))),
                'verification_effort_hours': max(0.5, np.random.exponential(8)),
                'fix_effort_hours': max(0.5, np.random.exponential(12)),
                'related_bugs': max(0, np.random.poisson(1)),
                'priority': np.random.choice(['low', 'medium', 'high', 'urgent'], p=[0.30, 0.40, 0.25, 0.05])
            }

            # Add resolution date if fixed/closed
            if status in ['fixed', 'closed']:
                days_to_fix = max(1, np.random.exponential(30))
                report['resolution_date'] = report['report_date'] + pd.Timedelta(days=days_to_fix)
            else:
                report['resolution_date'] = None

            data.append(report)

        return pd.DataFrame(data)

    def _create_llm_training_examples(self) -> List[Dict[str, str]]:
        """Create training examples for LLM fine-tuning"""
        examples = []

        # Basic testbench examples
        modules = ['FIFO', 'Counter', 'Multiplexer', 'Decoder', 'Encoder', 'Shifter']
        for module in modules:
            examples.append({
                'instruction': f'Generate basic testbench for {module} module',
                'response': f'''module {module.lower()}_testbench;
    logic clk = 0;
    logic reset_n = 1;

    // {module} signals
    logic enable;
    logic [7:0] data_in, data_out;

    always #5 clk = ~clk;

    initial begin
        reset_n = 0; #20 reset_n = 1;
        @(posedge clk);

        // Test {module} functionality
        enable = 1;
        data_in = 8'hAA;

        repeat(10) @(posedge clk);

        $display("Test completed");
        $finish;
    end
endmodule'''
            })

        # Protocol-specific examples
        protocols = ['UART', 'SPI', 'I2C']
        for protocol in protocols:
            examples.append({
                'instruction': f'Generate {protocol} protocol assertions',
                'response': f'''module {protocol.lower()}_assertions(
    input clk, reset_n,
    input start, done, valid
);

    property {protocol.lower()}_handshake;
        @(posedge clk) disable iff (!reset_n)
        start |-> ##[1:100] done;
    endproperty

    assert property({protocol.lower()}_handshake)
        else $error("{protocol} handshake failed");

endmodule'''
            })

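        # Each record is a flat instruction/response pair, later serialised to
        # llm_training_examples.json by _create_synthetic_datasets, e.g.:
        #   {"instruction": "Generate basic testbench for FIFO module",
        #    "response": "module fifo_testbench; ..."}
        # (the response text comes verbatim from the f-string templates above).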
        return examples

    def create_synthetic_dataset(self, n_samples: int = 1000) -> pd.DataFrame:
        """Create a quick synthetic dataset for demo purposes"""
        return self._create_rtl_features(n_samples)

    def get_dataset_summary(self) -> Dict[str, Any]:
        """Get summary of all collected datasets"""
        summary = {
            'timestamp': pd.Timestamp.now().isoformat(),
            'total_datasets': len(self.collected_datasets),
            'datasets': self.collected_datasets
        }

        # Save summary
        summary_path = self.data_dir / "dataset_summary.json"
        with open(summary_path, 'w') as f:
            json.dump(summary, f, indent=2, default=str)

        return summary
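

# Minimal usage sketch (illustrative): a full collection run needs network
# access plus the GitPython, datasets, pandas and numpy packages, and the
# clones/downloads can occupy several GB of disk. The create_synthetic_dataset()
# path is offline and only needs pandas/numpy.
if __name__ == "__main__":
    collector = DatasetCollector(data_dir="data/datasets")

    # Quick offline demo: a small synthetic RTL-feature table
    demo_df = collector.create_synthetic_dataset(n_samples=100)
    print(demo_df.head())

    # Full collection (network-heavy); writes data/datasets/dataset_summary.json
    collector.collect_all_datasets()
    print(collector.get_dataset_summary()['total_datasets'])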