Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import re | |
| import os | |
| import sys | |
| import random | |
| import transformers | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| from transformers import RobertaTokenizer, RobertaForSequenceClassification | |
| import torch | |
| import torch.nn.functional as F | |
| from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler | |
| from transformers import T5Tokenizer, T5ForConditionalGeneration | |
| import gradio as gr | |
| def greet(co): | |
| code_text = [] | |
| code_text.append(co) | |
| code_text = ' '.join(code_text) | |
| code_text = re.sub('\/\*[\S\s]*\*\/', '', code_text) | |
| code_text = re.sub('\/\/.*', '', code_text) | |
| code_text = re.sub('(\\\\n)+', '\\n', code_text) | |
| # 1. CFA-CodeBERTa-small.pt -> CodeBERTa-small-v1 finetunig model | |
| path = os.getcwd() + '/models/CFA-CodeBERTa-small.pt' | |
| tokenizer = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1") | |
| input_ids = tokenizer.encode( | |
| code_text, max_length=512, truncation=True, padding='max_length') | |
| input_ids = torch.tensor([input_ids]) | |
| model = RobertaForSequenceClassification.from_pretrained( | |
| path, num_labels=2) | |
| model.to('cpu') | |
| pred_1 = model(input_ids)[0].detach().cpu().numpy()[0] | |
| # model(input_ids)[0].argmax().detach().cpu().numpy().item() | |
| # 2. CFA-codebert-c.pt -> codebert-c finetuning model | |
| path = os.getcwd() + '/models/CFA-codebert-c.pt' | |
| tokenizer = AutoTokenizer.from_pretrained(path) | |
| input_ids = tokenizer(code_text, padding=True, max_length=512, | |
| truncation=True, return_token_type_ids=True)['input_ids'] | |
| input_ids = torch.tensor([input_ids]) | |
| model = AutoModelForSequenceClassification.from_pretrained( | |
| path, num_labels=2) | |
| model.to('cpu') | |
| pred_2 = model(input_ids)[0].detach().cpu().numpy()[0] | |
| # 3. CFA-codebert-c-v2.pt -> undersampling + codebert-c finetuning model | |
| path = os.getcwd() + '/models/CFA-codebert-c-v2.pt' | |
| tokenizer = RobertaTokenizer.from_pretrained(path) | |
| input_ids = tokenizer(code_text, padding=True, max_length=512, | |
| truncation=True, return_token_type_ids=True)['input_ids'] | |
| input_ids = torch.tensor([input_ids]) | |
| model = RobertaForSequenceClassification.from_pretrained( | |
| path, num_labels=2) | |
| model.to('cpu') | |
| pred_3 = model(input_ids)[0].detach().cpu().numpy() | |
| # 4. codeT5 finetuning model | |
| path = os.getcwd() + '/models/CFA-codeT5' | |
| model_params = { | |
| # model_type: t5-base/t5-large | |
| "MODEL": path, | |
| "TRAIN_BATCH_SIZE": 8, # training batch size | |
| "VALID_BATCH_SIZE": 8, # validation batch size | |
| "VAL_EPOCHS": 1, # number of validation epochs | |
| "MAX_SOURCE_TEXT_LENGTH": 512, # max length of source text | |
| "MAX_TARGET_TEXT_LENGTH": 3, # max length of target text | |
| "SEED": 2022, # set seed for reproducibility | |
| } | |
| data = pd.DataFrame({'code': [code_text]}) | |
| pred_4 = T5Trainer( | |
| dataframe=data, | |
| source_text="code", | |
| model_params=model_params | |
| ) | |
| pred_4 = int(pred_4[0]) | |
| # ensemble | |
| tot_result = (pred_1 * 0.8 + pred_2 * 0.1 + | |
| pred_3 * 0.1 + pred_4 * 0.1).argmax() | |
| if tot_result == 0: | |
| return "false positive !!" | |
| else: | |
| return "true positive !!" | |
| # codeT5 | |
| class YourDataSetClass(Dataset): | |
| def __init__( | |
| self, dataframe, tokenizer, source_len, source_text): | |
| self.tokenizer = tokenizer | |
| self.data = dataframe | |
| self.source_len = source_len | |
| # self.summ_len = target_len | |
| # self.target_text = self.data[target_text] | |
| self.source_text = self.data[source_text] | |
| def __len__(self): | |
| return len(self.source_text) | |
| def __getitem__(self, index): | |
| source_text = str(self.source_text[index]) | |
| source_text = " ".join(source_text.split()) | |
| source = self.tokenizer.batch_encode_plus( | |
| [source_text], | |
| max_length=self.source_len, | |
| pad_to_max_length=True, | |
| truncation=True, | |
| padding="max_length", | |
| return_tensors="pt", | |
| ) | |
| source_ids = source["input_ids"].squeeze() | |
| source_mask = source["attention_mask"].squeeze() | |
| return { | |
| "source_ids": source_ids.to(dtype=torch.long), | |
| "source_mask": source_mask.to(dtype=torch.long), | |
| } | |
| def validate(epoch, tokenizer, model, device, loader): | |
| model.eval() | |
| predictions = [] | |
| with torch.no_grad(): | |
| for _, data in enumerate(loader, 0): | |
| ids = data['source_ids'].to(device, dtype=torch.long) | |
| mask = data['source_mask'].to(device, dtype=torch.long) | |
| generated_ids = model.generate( | |
| input_ids=ids, | |
| attention_mask=mask, | |
| max_length=150, | |
| num_beams=2, | |
| repetition_penalty=2.5, | |
| length_penalty=1.0, | |
| early_stopping=True | |
| ) | |
| preds = [tokenizer.decode( | |
| g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids] | |
| if ((preds != '0') | (preds != '1')): | |
| preds = '0' | |
| predictions.extend(preds) | |
| return predictions | |
| def T5Trainer(dataframe, source_text, model_params, step="test",): | |
| torch.manual_seed(model_params["SEED"]) # pytorch random seed | |
| np.random.seed(model_params["SEED"]) # numpy random seed | |
| torch.backends.cudnn.deterministic = True | |
| tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"]) | |
| model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"]) | |
| model = model.to('cpu') | |
| dataframe = dataframe[[source_text]] | |
| val_dataset = dataframe | |
| val_set = YourDataSetClass( | |
| val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], source_text) | |
| val_params = { | |
| 'batch_size': model_params["VALID_BATCH_SIZE"], | |
| 'shuffle': False, | |
| 'num_workers': 0 | |
| } | |
| val_loader = DataLoader(val_set, **val_params) | |
| for epoch in range(model_params["VAL_EPOCHS"]): | |
| predictions = validate(epoch, tokenizer, model, 'cpu', val_loader) | |
| return predictions | |
| ################################################################################# | |
| '''demo = gr.Interface( | |
| fn = greet, | |
| inputs = "text", | |
| outputs= "number") | |
| demo.launch(share=True) | |
| ''' | |
| with gr.Blocks() as demo1: | |
| gr.Markdown( | |
| """ | |
| <h1 align="center"> | |
| False-Alarm-Detector | |
| </h1> | |
| """) | |
| gr.Markdown( | |
| """ | |
| ์ ์ ๋ถ์๊ธฐ๋ก ์ค๋ฅ๋ผ๊ณ ๋ณด๊ณ ๋ ์ฝ๋๋ฅผ ์ ๋ ฅํ๋ฉด, | |
| ์ค๋ฅ๊ฐ True-positive ์ธ์ง False-positive ์ธ์ง ๋ถ๋ฅ ํด ์ฃผ๋ ํ๋ก๊ทธ๋จ์ด๋ค. | |
| """) | |
| with gr.Accordion(label='๋ชจ๋ธ์ ๋ํ ์ค๋ช ( ์ฌ๊ธฐ๋ฅผ ํด๋ฆญ ํ์์ค. )',open=False): | |
| gr.Markdown( | |
| """ | |
| ์ด 3๊ฐ์ ๋ชจ๋ธ์ ์ฌ์ฉํ์๋ค. | |
| 1. codeBERTa-small-v1 | |
| - codeBERTa-small-v1 ์ค๋ช | |
| 2. codeBERT - C | |
| - codeBERT - C ์ค๋ช | |
| 3. codeT5 | |
| - codeT5 ์ค๋ช | |
| """ | |
| ) | |
| with gr.Row(): | |
| with gr.Column(): | |
| inputs_1 = gr.Textbox(placeholder="์ฝ๋๋ฅผ ์ ๋ ฅํ์์ค.", label='Code') | |
| with gr.Row(): | |
| btn = gr.Button("๊ฒฐ๊ณผ ์ถ๋ ฅ") | |
| with gr.Column(): | |
| outputs_1 = gr.Text(label = 'Result') | |
| btn.click(fn = greet, inputs = inputs_1, outputs= outputs_1) | |
| if __name__ == "__main__": | |
| demo1.launch() | |