#!/usr/bin/env python
# coding: utf-8

# In[1]:
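
# Pseudo-log-likelihood (PLL) evaluation of masked variable-name prediction: a fine-tuned
# GraphCodeBERT masked-LM checkpoint is compared against the stock
# microsoft/graphcodebert-base model, both on the ground-truth variable names and on
# randomly sampled "mock" names of the same sub-token length.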
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.nn import init, MarginRankingLoss
from torch.optim import Adam
from distutils.version import LooseVersion
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import math
from transformers import AutoConfig, AutoModel, AutoTokenizer
import nltk
import re
import torch.optim as optim
from tqdm import tqdm
from transformers import AutoModelForMaskedLM
import torch.nn.functional as F
import random

# In[2]:

maskis = []
n_y = []
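
# MyDataset reads a CSV whose 'X' column holds a code snippet with a "[MASK]" placeholder
# and whose 'y' column holds the ground-truth variable name. Each snippet is tokenized,
# split into 510-token chunks, wrapped with special tokens, padded to 512 tokens per chunk,
# and padded out to a fixed 250 chunks per sample. The per-chunk positions of the first mask
# token of every masked span go into the global `maskis`, and the raw labels into `n_y`,
# both indexed by row order.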
class MyDataset(Dataset):
    def __init__(self, file_name):
        global maskis
        global n_y
        df = pd.read_csv(file_name)
        df = df.fillna("")
        self.inp_dicts = []
        for r in range(df.shape[0]):
            X_init = df['X'][r]
            y = df['y'][r]
            n_y.append(y)
            # Split the camelCase/PascalCase name into lower-cased words before tokenizing.
            nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
            lb = ' '.join(nl).lower()
            x = tokenizer.tokenize(lb)
            num_sub_tokens_label = len(x)
            # Expand the single [MASK] placeholder to one mask token per label sub-token.
            X_init = X_init.replace("[MASK]", " ".join([tokenizer.mask_token] * num_sub_tokens_label))
            tokens = tokenizer.encode_plus(X_init, add_special_tokens=False, return_tensors='pt')
            # Split into chunks of 510 tokens, leaving room for the two special tokens.
            input_id_chunki = tokens['input_ids'][0].split(510)
            input_id_chunks = []
            mask_chunks = []
            mask_chunki = tokens['attention_mask'][0].split(510)
            for tensor in input_id_chunki:
                input_id_chunks.append(tensor)
            for tensor in mask_chunki:
                mask_chunks.append(tensor)
            # Wrap every chunk with special tokens and extend the attention mask accordingly.
            # 101/102 are BERT-style [CLS]/[SEP] ids; graphcodebert's own ids are
            # tokenizer.cls_token_id / tokenizer.sep_token_id.
            xi = torch.full((1,), fill_value=101)
            yi = torch.full((1,), fill_value=1)
            zi = torch.full((1,), fill_value=102)
            for r in range(len(input_id_chunks)):
                input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]], dim=-1)
                input_id_chunks[r] = torch.cat([input_id_chunks[r], zi], dim=-1)
                mask_chunks[r] = torch.cat([yi, mask_chunks[r]], dim=-1)
                mask_chunks[r] = torch.cat([mask_chunks[r], yi], dim=-1)
            # Pad every chunk to 512 positions (pad id 0; the attention mask stays 0 there).
            di = torch.full((1,), fill_value=0)
            for i in range(len(input_id_chunks)):
                pad_len = 512 - input_id_chunks[i].shape[0]
                if pad_len > 0:
                    for p in range(pad_len):
                        input_id_chunks[i] = torch.cat([input_id_chunks[i], di], dim=-1)
                        mask_chunks[i] = torch.cat([mask_chunks[i], di], dim=-1)
            vb = torch.ones_like(input_id_chunks[0])
            fg = torch.zeros_like(input_id_chunks[0])
            # Record, per chunk, the position of the first mask token of each masked span.
            maski = []
            for l in range(len(input_id_chunks)):
                masked_pos = []
                for i in range(len(input_id_chunks[l])):
                    if input_id_chunks[l][i] == tokenizer.mask_token_id:  # 103 only for BERT vocabularies
                        if i != 0 and input_id_chunks[l][i - 1] == tokenizer.mask_token_id:
                            continue
                        masked_pos.append(i)
                maski.append(masked_pos)
            maskis.append(maski)
            # Pad the chunk list to a fixed 250 chunks so every sample stacks to the same shape.
            while len(input_id_chunks) < 250:
                input_id_chunks.append(vb)
                mask_chunks.append(fg)
            input_ids = torch.stack(input_id_chunks)
            attention_mask = torch.stack(mask_chunks)
            input_dict = {
                'input_ids': input_ids.long(),
                'attention_mask': attention_mask.int()
            }
            self.inp_dicts.append(input_dict)
            del input_dict
            del input_ids
            del attention_mask
            del maski
            del mask_chunks
            del input_id_chunks
            del di
            del fg
            del vb
            del mask_chunki
            del input_id_chunki
            del X_init
            del y
            del tokens
            del x
            del lb
            del nl
        del df

    def __len__(self):
        return len(self.inp_dicts)

    def __getitem__(self, idx):
        return self.inp_dicts[idx]
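
# Note: MyDataset.__init__ reads the module-level `tokenizer`, so the dataset can only be
# instantiated after the tokenizer below has been loaded.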

# In[3]:

tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
model = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")
base_model = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")
# Load the fine-tuned checkpoint into `model`; `base_model` keeps the pretrained weights.
model.load_state_dict(torch.load('var_runs/model_26_2'))
model.eval()
base_model.eval()
myDs = MyDataset('d_t.csv')
train_loader = DataLoader(myDs, batch_size=1, shuffle=False)
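# The evaluation loop walks the globals `maskis` / `n_y` with a running counter, so the
# loader must preserve dataset order (shuffle=False); with batch_size=1 each batch holds
# a single sample.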

# In[4]:

variable_names = [
    # One-word Variable Names
    'count', 'value', 'result', 'flag', 'max', 'min', 'data', 'input', 'output', 'name', 'index', 'status', 'error', 'message', 'price', 'quantity', 'total', 'length', 'size', 'score',
    # Two-word Variable Names
    'studentName', 'accountBalance', 'isFound', 'maxScore', 'userAge', 'carModel', 'bookTitle', 'arrayLength', 'employeeID', 'itemPrice', 'customerAddress', 'productCategory', 'orderNumber', 'transactionType', 'bankAccount', 'shippingMethod', 'deliveryDate', 'purchaseAmount', 'inventoryItem', 'salesRevenue',
    # Three-word Variable Names
    'numberOfStudents', 'averageTemperature', 'userIsLoggedIn', 'totalSalesAmount', 'employeeSalaryRate', 'maxAllowedAttempts', 'selectedOption', 'shippingAddress', 'manufacturingDate', 'connectionPool', 'customerAccountBalance', 'employeeSalaryReport', 'productInventoryCount', 'transactionProcessingStatus', 'userAuthenticationToken', 'orderShippingAddress', 'databaseConnectionPoolSize', 'vehicleEngineTemperature', 'sensorDataProcessingRate', 'employeePayrollSystem',
    # Four-word Variable Names
    'customerAccountBalanceValue', 'employeeSalaryReportData', 'productInventoryItemCount', 'transactionProcessingStatusFlag', 'userAuthenticationTokenKey', 'orderShippingAddressDetails', 'databaseConnectionPoolMaxSize', 'vehicleEngineTemperatureReading', 'sensorDataProcessingRateLimit', 'employeePayrollSystemData', 'customerOrderShippingAddress', 'productCatalogItemNumber', 'transactionProcessingSuccessFlag', 'userAuthenticationAccessToken', 'databaseConnectionPoolConfig', 'vehicleEngineTemperatureSensor', 'sensorDataProcessingRateLimitation', 'employeePayrollSystemConfiguration', 'customerAccountBalanceHistoryData', 'transactionProcessingStatusTracking'
]
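
# Bucket the mock names by the number of sub-tokens the tokenizer splits them into (1-6),
# so that a mock name with the same sub-token length as the ground truth can be sampled.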
var_list = []
for j in range(6):
    d = []
    var_list.append(d)
for var in variable_names:
    try:
        var_list[len(tokenizer.tokenize(var)) - 1].append(var)
    except IndexError:
        # Skip names that tokenize to more than 6 sub-tokens.
        continue

# In[5]:

tot_pll = 0.0
base_tot_pll = 0.0
loop = tqdm(train_loader, leave=True)
cntr = 0
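
# For each sample: run the fine-tuned and base models over its chunks, gather the vocabulary
# logits at the masked positions (averaging over all masked occurrences of the name),
# softmax them, and take the mean negative log-probability of the ground-truth sub-tokens
# (the "Sent PLL"). The same quantities are computed for the sampled mock name, and
# per-sample values plus running totals are printed.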
for batch in loop:
    maxi = torch.tensor(0.0, requires_grad=True)
    for i in range(len(batch['input_ids'])):
        cntr += 1
        maski = maskis[cntr - 1]
        li = len(maski)
        # Only the first `li` chunks are real; the rest are padding chunks.
        input_ids = batch['input_ids'][i][:li]
        att_mask = batch['attention_mask'][i][:li]
        y = n_y[cntr - 1]
        ty = tokenizer.encode(y)[1:-1]
        num_sub_tokens_label = len(ty)
        if num_sub_tokens_label > 6:
            continue
        print("Ground truth:", y)
        # Sample a mock name with the same number of sub-tokens as the ground truth.
        m_y = random.choice(var_list[num_sub_tokens_label - 1])
        m_ty = tokenizer.encode(m_y)[1:-1]
        print("Mock truth:", m_y)
        # input_ids, att_mask = input_ids.to(device), att_mask.to(device)
        outputs = model(input_ids, attention_mask=att_mask)
        base_outputs = base_model(input_ids, attention_mask=att_mask)
        last_hidden_state = outputs[0].squeeze()
        base_last_hidden_state = base_outputs[0].squeeze()
        l_o_l_sa = []
        base_l_o_l_sa = []
        sum_state = []
        base_sum_state = []
        for t in range(num_sub_tokens_label):
            c = []
            d = []
            l_o_l_sa.append(c)
            base_l_o_l_sa.append(d)
        # Collect the logits at each masked position (offset t for the t-th sub-token).
        if len(maski) == 1:
            masked_pos = maski[0]
            for k in masked_pos:
                for t in range(num_sub_tokens_label):
                    l_o_l_sa[t].append(last_hidden_state[k + t])
                    base_l_o_l_sa[t].append(base_last_hidden_state[k + t])
        else:
            for p in range(len(maski)):
                masked_pos = maski[p]
                for k in masked_pos:
                    for t in range(num_sub_tokens_label):
                        # A masked span can run past the end of a chunk into the next one.
                        if (k + t) >= len(last_hidden_state[p]):
                            l_o_l_sa[t].append(last_hidden_state[p + 1][k + t - len(last_hidden_state[p])])
                            base_l_o_l_sa[t].append(base_last_hidden_state[p + 1][k + t - len(base_last_hidden_state[p])])
                            continue
                        l_o_l_sa[t].append(last_hidden_state[p][k + t])
                        base_l_o_l_sa[t].append(base_last_hidden_state[p][k + t])
        # Average the logits over all masked occurrences of the name.
        for t in range(num_sub_tokens_label):
            sum_state.append(l_o_l_sa[t][0])
            base_sum_state.append(base_l_o_l_sa[t][0])
        for j in range(len(l_o_l_sa[0])):
            if j == 0:
                continue
            for t in range(num_sub_tokens_label):
                sum_state[t] = sum_state[t] + l_o_l_sa[t][j]
                base_sum_state[t] = base_sum_state[t] + base_l_o_l_sa[t][j]
        yip = len(l_o_l_sa[0])
        val = 0.0
        m_val = 0.0
        m_base_val = 0.0
        base_val = 0.0
        for t in range(num_sub_tokens_label):
            sum_state[t] /= yip
            base_sum_state[t] /= yip
            probs = F.softmax(sum_state[t], dim=0)
            base_probs = F.softmax(base_sum_state[t], dim=0)
            val = val - torch.log(probs[ty[t]])
            m_val = m_val - torch.log(probs[m_ty[t]])
            base_val = base_val - torch.log(base_probs[ty[t]])
            m_base_val = m_base_val - torch.log(base_probs[m_ty[t]])
        # Mean negative log-probability over the label's sub-tokens.
        val = val / num_sub_tokens_label
        base_val = base_val / num_sub_tokens_label
        m_val = m_val / num_sub_tokens_label
        m_base_val = m_base_val / num_sub_tokens_label
        print("Sent PLL:")
        print(val)
        print("Base Sent PLL:")
        print(base_val)
        print("Net % difference:")
        diff = (val - base_val) * 100 / base_val
        print(diff)
        tot_pll += val
        base_tot_pll += base_val
        print()
        print()
        print("Mock Sent PLL:")
        print(m_val)
        print("Mock Base Sent PLL:")
        print(m_base_val)
        print("Mock Net % difference:")
        m_diff = (m_val - m_base_val) * 100 / m_base_val
        print(m_diff)
        for c in sum_state:
            del c
        for d in base_sum_state:
            del d
        del sum_state
        del base_sum_state
        for c in l_o_l_sa:
            del c
        for c in base_l_o_l_sa:
            del c
        del l_o_l_sa
        del base_l_o_l_sa
        del maski
        del input_ids
        del att_mask
        del last_hidden_state
        del base_last_hidden_state

print("Tot PLL: ", tot_pll)
print("Base Tot PLL: ", base_tot_pll)


# In[ ]: