eunJ committed on
Commit 10e5ef2 · verified · 1 Parent(s): 601c38e

initial push (safetensors + custom head)

README.md CHANGED
@@ -1,54 +1,6 @@
- ---
- language:
- - code
- library_name: transformers
- pipeline_tag: text-classification
- tags:
- - code-analysis
- - vulnerability-detection
- - security
- - cwe
- license: mit
- base_model: microsoft/codebert-base
- ---
-
- # CodeBERT Vulnerability Detector (Multi-class)
-
- A multi-class classification model that detects vulnerabilities in C/C++ code.
-
- ## Model Information
- - **Base model**: microsoft/codebert-base
- - **Classes**: 4 (CWE-79, CWE-89, CWE-119, Other)
- - **Input**: C/C++ source code text
-
- ## Usage
-
  ```python
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
- import torch
-
- # Load the model
- model_name = "eunJ/codebert_vulnerability_detector_multi"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
-
- # Analyze code
- code = '''
- char buffer[100];
- gets(buffer);
- '''
-
- inputs = tokenizer(code, return_tensors="pt", max_length=512, truncation=True)
- with torch.no_grad():
-     outputs = model(**inputs)
-     predictions = torch.softmax(outputs.logits, dim=-1)
-     predicted_class = torch.argmax(predictions)
-
- print(f"Predicted class: {predicted_class.item()}")
- ```
-
- ## Class Labels
- - 0: CWE-79 (Cross-site Scripting)
- - 1: CWE-89 (SQL Injection)
- - 2: CWE-119 (Buffer Overflow)
- - 3: CWE-Other
+ # Custom RoBERTa (safetensors)
+ Load with:
  ```python
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ tok = AutoTokenizer.from_pretrained("eunJ/codebert_vulnerability_detector_multi")
+ model = AutoModelForSequenceClassification.from_pretrained("eunJ/codebert_vulnerability_detector_multi", trust_remote_code=True).eval()
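The loading snippet above pairs naturally with the inference flow from the previous README. A minimal sketch, reusing `tok` and `model` from the block above and assuming the four labels still follow the old mapping (CWE-79, CWE-89, CWE-119, Other); note the custom head returns a plain dict, so logits are read by key:

```python
import torch

# Hypothetical example input: unbounded read into a fixed-size stack buffer
code = """
char buffer[100];
gets(buffer);
"""

inputs = tok(code, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    logits = model(**inputs)["logits"]  # custom head returns {"loss": ..., "logits": ...}
probs = torch.softmax(logits, dim=-1)
pred = int(probs.argmax(dim=-1))
print(model.config.id2label[pred], f"{probs[0, pred].item():.3f}")
```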
config.json CHANGED
@@ -1,19 +1,42 @@
  {
- "model_type": "roberta",
  "architectures": [
- "RobertaForSequenceClassification"
  ],
- "num_labels": 4,
  "id2label": {
  "0": "LABEL_0",
  "1": "LABEL_1",
  "2": "LABEL_2",
  "3": "LABEL_3"
  },
  "label2id": {
  "LABEL_0": 0,
  "LABEL_1": 1,
  "LABEL_2": 2,
  "LABEL_3": 3
- }
- }
  {
+ "_name_or_path": "microsoft/codebert-base",
  "architectures": [
+ "RobertaModel"
  ],
+ "attention_probs_dropout_prob": 0.1,
+ "auto_map": {
+ "AutoModelForSequenceClassification": "modeling_my_roberta.MyRobertaForSequenceClassification"
+ },
+ "bos_token_id": 0,
+ "classifier_dropout": null,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
  "id2label": {
  "0": "LABEL_0",
  "1": "LABEL_1",
  "2": "LABEL_2",
  "3": "LABEL_3"
  },
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
  "label2id": {
  "LABEL_0": 0,
  "LABEL_1": 1,
  "LABEL_2": 2,
  "LABEL_3": 3
+ },
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 514,
+ "model_type": "roberta",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "output_past": true,
+ "pad_token_id": 1,
+ "position_embedding_type": "absolute",
+ "transformers_version": "4.49.0",
+ "type_vocab_size": 1,
+ "use_cache": true,
+ "vocab_size": 50265
+ }
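The important addition here is `auto_map`: with `trust_remote_code=True`, `AutoModelForSequenceClassification` now resolves to `modeling_my_roberta.MyRobertaForSequenceClassification` instead of the stock RoBERTa head. The explicit `num_labels` key is gone, but the four `id2label` entries still yield `config.num_labels == 4`. A small sketch to verify the mapping, assuming the same repo id as in the README:

```python
from transformers import AutoConfig, AutoModelForSequenceClassification

repo = "eunJ/codebert_vulnerability_detector_multi"

cfg = AutoConfig.from_pretrained(repo)
print(cfg.auto_map)    # expected: {'AutoModelForSequenceClassification': 'modeling_my_roberta.MyRobertaForSequenceClassification'}
print(cfg.num_labels)  # 4, derived from the id2label entries

model = AutoModelForSequenceClassification.from_pretrained(repo, trust_remote_code=True)
print(type(model).__name__)  # expected: MyRobertaForSequenceClassification
```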
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c8037175a0f7980967910a796119cf0250a6c0200f7568225f2e9aaeb43b9b68
- size 498633008
  version https://git-lfs.github.com/spec/v1
+ oid sha256:28416b8a21440d53974e0456bf036dd7b9778a371c50b159f8870f6742c8ecad
+ size 496256360
modeling_my_roberta.py ADDED
@@ -0,0 +1,54 @@
+ # modeling_my_roberta.py
+ from typing import Optional, Tuple
+ import torch
+ import torch.nn as nn
+ from transformers import RobertaModel, RobertaPreTrainedModel
+
+ class MyRobertaForSequenceClassification(RobertaPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = getattr(config, "num_labels", 4)
+         self.roberta = RobertaModel(config, add_pooling_layer=False)
+         self.classifier = nn.Linear(config.hidden_size, self.num_labels)
+         self.loss_fn = nn.CrossEntropyLoss()
+         self.post_init()  # init_weights
+
+     def _get_pad_id(self, input_ids):
+         # RoBERTa's pad id is usually 1, but passing the tokenizer's attention_mask directly is safer
+         return 1
+
+     def _pool(self, last_hidden_state, attention_mask, model_type: str = "roberta"):
+         # Pooling logic: CLS token for BERT/RoBERTa-style encoders, masked mean otherwise
+         if last_hidden_state.dim() == 3:
+             if model_type in {"bert", "roberta", "deberta", "xlm-roberta", "electra"}:
+                 return last_hidden_state[:, 0, :]
+             mask = attention_mask.unsqueeze(-1).float()
+             summed = (last_hidden_state * mask).sum(dim=1)
+             denom = mask.sum(dim=1).clamp(min=1e-9)
+             return summed / denom
+         elif last_hidden_state.dim() == 2:
+             return last_hidden_state
+         else:
+             raise ValueError(f"Unexpected hidden dim: {last_hidden_state.dim()}")
+
+     def forward(
+         self,
+         input_ids=None,
+         attention_mask=None,
+         labels: Optional[torch.LongTensor] = None,
+         **kwargs
+     ):
+         if attention_mask is None and input_ids is not None:
+             pad_id = self._get_pad_id(input_ids)
+             attention_mask = input_ids.ne(pad_id)
+
+         outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
+         hidden = outputs.last_hidden_state
+         pooled = self._pool(hidden, attention_mask, "roberta")
+         logits = self.classifier(pooled)
+
+         loss = None
+         if labels is not None:
+             loss = self.loss_fn(logits, labels.long())
+
+         return {"loss": loss, "logits": logits}
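For a quick local smoke test of this head, a sketch assuming `modeling_my_roberta.py` is on the Python path and that pulling the base config from microsoft/codebert-base is acceptable; weights are randomly initialised here, so only the shapes and the loss path are exercised:

```python
import torch
from transformers import RobertaConfig
from modeling_my_roberta import MyRobertaForSequenceClassification

config = RobertaConfig.from_pretrained("microsoft/codebert-base", num_labels=4)
model = MyRobertaForSequenceClassification(config).eval()

input_ids = torch.tensor([[0, 9226, 16, 10, 1296, 2]])  # arbitrary ids framed by <s> ... </s>
out = model(input_ids=input_ids, labels=torch.tensor([2]))

print(out["logits"].shape)  # torch.Size([1, 4])
print(out["loss"])          # scalar cross-entropy against label 2
```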
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -45,7 +45,6 @@
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "cls_token": "<s>",
- "do_lower_case": false,
  "eos_token": "</s>",
  "errors": "replace",
  "extra_special_tokens": {},
@@ -54,5 +53,6 @@
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "RobertaTokenizer",
  "unk_token": "<unk>"
  }
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "errors": "replace",
  "extra_special_tokens": {},
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "RobertaTokenizer",
+ "trim_offsets": true,
  "unk_token": "<unk>"
  }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff