123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189 |
- import torch
- import pandas as pd
- from sklearn.metrics import roc_auc_score, mean_squared_error
- import numpy as np
- from .augmentations import embed_data_mask
- import torch.nn as nn
- from sklearn.preprocessing import LabelEncoder
- def make_default_mask(x):
- mask = np.ones_like(x)
- mask[:,-1] = 0
- return mask
- def tag_gen(tag,y):
- return np.repeat(tag,len(y['data']))
- def count_parameters(model):
- return sum(p.numel() for p in model.parameters() if p.requires_grad)
- def get_scheduler(args, optimizer):
- if args.scheduler == 'cosine':
- scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs)
- elif args.scheduler == 'linear':
- scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
- milestones=[args.epochs // 2.667, args.epochs // 1.6, args.epochs // 1.142], gamma=0.1)
- return scheduler
- def imputations_acc_justy(model,dloader,device):
- model.eval()
- m = nn.Softmax(dim=1)
- y_test = torch.empty(0).to(device)
- y_pred = torch.empty(0).to(device)
- prob = torch.empty(0).to(device)
- with torch.no_grad():
- for i, data in enumerate(dloader, 0):
- x_categ, x_cont, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device)
- _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model)
- reps = model.transformer(x_categ_enc, x_cont_enc)
- y_reps = reps[:,model.num_categories-1,:]
- y_outs = model.mlpfory(y_reps)
- # import ipdb; ipdb.set_trace()
- y_test = torch.cat([y_test,x_categ[:,-1].float()],dim=0)
- y_pred = torch.cat([y_pred,torch.argmax(m(y_outs), dim=1).float()],dim=0)
- prob = torch.cat([prob,m(y_outs)[:,-1].float()],dim=0)
-
- correct_results_sum = (y_pred == y_test).sum().float()
- acc = correct_results_sum/y_test.shape[0]*100
- auc = roc_auc_score(y_score=prob.cpu(), y_true=y_test.cpu())
- return acc, auc
- def multiclass_acc_justy(model,dloader,device):
- model.eval()
- vision_dset = True
- m = nn.Softmax(dim=1)
- y_test = torch.empty(0).to(device)
- y_pred = torch.empty(0).to(device)
- prob = torch.empty(0).to(device)
- with torch.no_grad():
- for i, data in enumerate(dloader, 0):
- x_categ, x_cont, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device)
- _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset)
- reps = model.transformer(x_categ_enc, x_cont_enc)
- y_reps = reps[:,model.num_categories-1,:]
- y_outs = model.mlpfory(y_reps)
- # import ipdb; ipdb.set_trace()
- y_test = torch.cat([y_test,x_categ[:,-1].float()],dim=0)
- y_pred = torch.cat([y_pred,torch.argmax(m(y_outs), dim=1).float()],dim=0)
-
- correct_results_sum = (y_pred == y_test).sum().float()
- acc = correct_results_sum/y_test.shape[0]*100
- return acc, 0
- def classification_scores(model, dloader, device, task,vision_dset):
- model.eval()
- m = nn.Softmax(dim=1)
- y_test = torch.empty(0).to(device)
- y_pred = torch.empty(0).to(device)
- prob = torch.empty(0).to(device)
- with torch.no_grad():
- for i, data in enumerate(dloader, 0):
- x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device)
- _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset)
- reps = model.transformer(x_categ_enc, x_cont_enc)
- y_reps = reps[:,0,:]
- y_outs = model.mlpfory(y_reps)
- # import ipdb; ipdb.set_trace()
- y_test = torch.cat([y_test,y_gts.squeeze().float()],dim=0)
- y_pred = torch.cat([y_pred,torch.argmax(y_outs, dim=1).float()],dim=0)
- if task == 'binary':
- prob = torch.cat([prob,m(y_outs)[:,-1].float()],dim=0)
-
- correct_results_sum = (y_pred == y_test).sum().float()
- acc = correct_results_sum/y_test.shape[0]*100
- auc = 0
- if task == 'binary':
- try:
- auc = roc_auc_score(y_score=prob.cpu(), y_true=y_test.cpu())
- except: # in case we only have class in our test set (like for ASR)
- auc = 0.0
- return acc.cpu().numpy(), auc
- def mean_sq_error(model, dloader, device, vision_dset):
- model.eval()
- y_test = torch.empty(0).to(device)
- y_pred = torch.empty(0).to(device)
- with torch.no_grad():
- for i, data in enumerate(dloader, 0):
- x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device)
- _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset)
- reps = model.transformer(x_categ_enc, x_cont_enc)
- y_reps = reps[:,0,:]
- y_outs = model.mlpfory(y_reps)
- y_test = torch.cat([y_test,y_gts.squeeze().float()],dim=0)
- y_pred = torch.cat([y_pred,y_outs],dim=0)
- # import ipdb; ipdb.set_trace()
- rmse = mean_squared_error(y_test.cpu(), y_pred.cpu(), squared=False)
- return rmse
- def prepareData(train, valid, test, test_backdoor, cat_cols, num_cols, target):
- X_train_df = train.drop(target, axis=1)
- y_train_df = train[target].values
-
- X_valid_df = valid.drop(target, axis=1)
- y_valid_df = valid[target].values
-
- X_test_df = test.drop(target, axis=1)
- y_test_df = test[target].values
-
- X_test_backdoor_df = test_backdoor.drop(target, axis=1)
- y_test_backdoor_df = test_backdoor[target].values
- combined_df = pd.concat([X_train_df, X_valid_df, X_test_df])
- cat_dims = []
- for col in cat_cols:
- l_enc = LabelEncoder()
- l_enc.fit(combined_df[col].values)
- X_train_df[col] = l_enc.transform(X_train_df[col].values)
- X_valid_df[col] = l_enc.transform(X_valid_df[col].values)
- X_test_df[col] = l_enc.transform(X_test_df[col].values)
- X_test_backdoor_df[col] = l_enc.transform(X_test_backdoor_df[col].values)
- cat_dims.append(len(l_enc.classes_))
-
- cat_idxs = [X_train_df.columns.get_loc(c) for c in cat_cols]
- con_idxs = [X_train_df.columns.get_loc(c) for c in num_cols]
-
- X_train = {
- 'data': X_train_df.to_numpy(dtype="float64"),
- 'mask': np.full(X_train_df.shape, 1)
- }
- y_train = {
- 'data': y_train_df.reshape(-1, 1)
- }
-
- X_valid = {
- 'data': X_valid_df.to_numpy(dtype="float64"),
- 'mask': np.full(X_valid_df.shape, 1)
- }
- y_valid = {
- 'data': y_valid_df.reshape(-1, 1)
- }
-
- X_test = {
- 'data': X_test_df.to_numpy(dtype="float64"),
- 'mask': np.full(X_test_df.shape, 1)
- }
- y_test = {
- 'data': y_test_df.reshape(-1, 1)
- }
-
- X_test_backdoor = {
- 'data': X_test_backdoor_df.to_numpy(dtype="float64"),
- 'mask': np.full(X_test_backdoor_df.shape, 1)
- }
- y_test_backdoor = {
- 'data': y_test_backdoor_df.reshape(-1, 1)
- }
-
- train_mean, train_std = np.array(X_train['data'][:,con_idxs],dtype=np.float32).mean(0), np.array(X_train['data'][:,con_idxs],dtype=np.float32).std(0)
- train_std = np.where(train_std < 1e-6, 1e-6, train_std)
-
- return (cat_dims, cat_idxs, con_idxs,
- X_train, y_train, X_valid, y_valid,
- X_test, y_test, X_test_backdoor, y_test_backdoor,
- train_mean, train_std)
|