Upload vulnerability detection model
Browse files- README.md +45 -25
- config.json +25 -4
- model.safetensors +2 -2
- tokenizer_config.json +0 -1
README.md
CHANGED
@@ -1,34 +1,54 @@
|
|
1 |
---
|
|
|
|
|
2 |
library_name: transformers
|
|
|
3 |
tags:
|
4 |
-
-
|
5 |
-
-
|
6 |
-
-
|
7 |
-
|
8 |
-
- DetectVul/bigvul
|
9 |
-
language:
|
10 |
-
- en
|
11 |
-
base_model:
|
12 |
-
- microsoft/codebert-base
|
13 |
license: mit
|
14 |
-
|
15 |
-
- accuracy
|
16 |
-
- precision
|
17 |
-
- f1
|
18 |
-
- recall
|
19 |
---
|
20 |
|
21 |
-
|
22 |
|
23 |
-
|
24 |
-
This model is a fine-tuned version of **microsoft/codebert-base**, optimized for detecting vulnerabilities in code. It is trained on the **bigvul** dataset. The model takes in a code snippet and classifies it as either **benign (0)** or **vulnerable (1)**.
|
25 |
|
26 |
-
##
|
|
|
|
|
|
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
language:
|
3 |
+
- code
|
4 |
library_name: transformers
|
5 |
+
pipeline_tag: text-classification
|
6 |
tags:
|
7 |
+
- code-analysis
|
8 |
+
- vulnerability-detection
|
9 |
+
- security
|
10 |
+
- cwe
|
|
|
|
|
|
|
|
|
|
|
11 |
license: mit
|
12 |
+
base_model: microsoft/codebert-base
|
|
|
|
|
|
|
|
|
13 |
---
|
14 |
|
15 |
+
# CodeBERT Vulnerability Detector (Multi-class)
|
16 |
|
17 |
+
C/C++ 코드의 취약점을 탐지하는 다중 클래스 분류 모델입니다.
|
|
|
18 |
|
19 |
+
## 모델 정보
|
20 |
+
- **기반 모델**: microsoft/codebert-base
|
21 |
+
- **분류 클래스**: 4개 (CWE-79, CWE-89, CWE-119, 기타)
|
22 |
+
- **입력**: C/C++ 소스 코드 텍스트
|
23 |
|
24 |
+
## 사용 방법
|
25 |
+
|
26 |
+
```python
|
27 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
28 |
+
import torch
|
29 |
+
|
30 |
+
# 모델 로드
|
31 |
+
model_name = "eunJ/codebert_vulnerability_detector_multi"
|
32 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
33 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
34 |
+
|
35 |
+
# 코드 분석
|
36 |
+
code = '''
|
37 |
+
char buffer[100];
|
38 |
+
gets(buffer);
|
39 |
+
'''
|
40 |
+
|
41 |
+
inputs = tokenizer(code, return_tensors="pt", max_length=512, truncation=True)
|
42 |
+
with torch.no_grad():
|
43 |
+
outputs = model(**inputs)
|
44 |
+
predictions = torch.softmax(outputs.logits, dim=-1)
|
45 |
+
predicted_class = torch.argmax(predictions)
|
46 |
+
|
47 |
+
print(f"예측 클래스: {predicted_class.item()}")
|
48 |
+
```
|
49 |
+
|
50 |
+
## 클래스 레이블
|
51 |
+
- 0: CWE-79 (Cross-site Scripting)
|
52 |
+
- 1: CWE-89 (SQL Injection)
|
53 |
+
- 2: CWE-119 (Buffer Overflow)
|
54 |
+
- 3: CWE-Other (기타)
|
config.json
CHANGED
@@ -1,19 +1,40 @@
|
|
1 |
{
|
2 |
-
"
|
3 |
"architectures": [
|
4 |
"RobertaForSequenceClassification"
|
5 |
],
|
6 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
"id2label": {
|
8 |
"0": "LABEL_0",
|
9 |
"1": "LABEL_1",
|
10 |
"2": "LABEL_2",
|
11 |
"3": "LABEL_3"
|
12 |
},
|
|
|
|
|
13 |
"label2id": {
|
14 |
"LABEL_0": 0,
|
15 |
"LABEL_1": 1,
|
16 |
"LABEL_2": 2,
|
17 |
"LABEL_3": 3
|
18 |
-
}
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "microsoft/codebert-base",
|
3 |
"architectures": [
|
4 |
"RobertaForSequenceClassification"
|
5 |
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"bos_token_id": 0,
|
8 |
+
"classifier_dropout": null,
|
9 |
+
"eos_token_id": 2,
|
10 |
+
"hidden_act": "gelu",
|
11 |
+
"hidden_dropout_prob": 0.1,
|
12 |
+
"hidden_size": 768,
|
13 |
"id2label": {
|
14 |
"0": "LABEL_0",
|
15 |
"1": "LABEL_1",
|
16 |
"2": "LABEL_2",
|
17 |
"3": "LABEL_3"
|
18 |
},
|
19 |
+
"initializer_range": 0.02,
|
20 |
+
"intermediate_size": 3072,
|
21 |
"label2id": {
|
22 |
"LABEL_0": 0,
|
23 |
"LABEL_1": 1,
|
24 |
"LABEL_2": 2,
|
25 |
"LABEL_3": 3
|
26 |
+
},
|
27 |
+
"layer_norm_eps": 1e-05,
|
28 |
+
"max_position_embeddings": 514,
|
29 |
+
"model_type": "roberta",
|
30 |
+
"num_attention_heads": 12,
|
31 |
+
"num_hidden_layers": 12,
|
32 |
+
"output_past": true,
|
33 |
+
"pad_token_id": 1,
|
34 |
+
"position_embedding_type": "absolute",
|
35 |
+
"torch_dtype": "float32",
|
36 |
+
"transformers_version": "4.49.0",
|
37 |
+
"type_vocab_size": 1,
|
38 |
+
"use_cache": true,
|
39 |
+
"vocab_size": 50265
|
40 |
+
}
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c601158bf8733adb819d956d6e3c418480f72fa7216b30463b1f5aa291ce2756
|
3 |
+
size 498618976
|
tokenizer_config.json
CHANGED
@@ -45,7 +45,6 @@
|
|
45 |
"bos_token": "<s>",
|
46 |
"clean_up_tokenization_spaces": false,
|
47 |
"cls_token": "<s>",
|
48 |
-
"do_lower_case": false,
|
49 |
"eos_token": "</s>",
|
50 |
"errors": "replace",
|
51 |
"extra_special_tokens": {},
|
|
|
45 |
"bos_token": "<s>",
|
46 |
"clean_up_tokenization_spaces": false,
|
47 |
"cls_token": "<s>",
|
|
|
48 |
"eos_token": "</s>",
|
49 |
"errors": "replace",
|
50 |
"extra_special_tokens": {},
|