1
0

utils.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. import torch
  2. import pandas as pd
  3. from sklearn.metrics import roc_auc_score, mean_squared_error
  4. import numpy as np
  5. from .augmentations import embed_data_mask
  6. import torch.nn as nn
  7. from sklearn.preprocessing import LabelEncoder
  8. def make_default_mask(x):
  9. mask = np.ones_like(x)
  10. mask[:,-1] = 0
  11. return mask
  12. def tag_gen(tag,y):
  13. return np.repeat(tag,len(y['data']))
  14. def count_parameters(model):
  15. return sum(p.numel() for p in model.parameters() if p.requires_grad)
  16. def get_scheduler(args, optimizer):
  17. if args.scheduler == 'cosine':
  18. scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs)
  19. elif args.scheduler == 'linear':
  20. scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
  21. milestones=[args.epochs // 2.667, args.epochs // 1.6, args.epochs // 1.142], gamma=0.1)
  22. return scheduler
  23. def imputations_acc_justy(model,dloader,device):
  24. model.eval()
  25. m = nn.Softmax(dim=1)
  26. y_test = torch.empty(0).to(device)
  27. y_pred = torch.empty(0).to(device)
  28. prob = torch.empty(0).to(device)
  29. with torch.no_grad():
  30. for i, data in enumerate(dloader, 0):
  31. x_categ, x_cont, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device)
  32. _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model)
  33. reps = model.transformer(x_categ_enc, x_cont_enc)
  34. y_reps = reps[:,model.num_categories-1,:]
  35. y_outs = model.mlpfory(y_reps)
  36. # import ipdb; ipdb.set_trace()
  37. y_test = torch.cat([y_test,x_categ[:,-1].float()],dim=0)
  38. y_pred = torch.cat([y_pred,torch.argmax(m(y_outs), dim=1).float()],dim=0)
  39. prob = torch.cat([prob,m(y_outs)[:,-1].float()],dim=0)
  40. correct_results_sum = (y_pred == y_test).sum().float()
  41. acc = correct_results_sum/y_test.shape[0]*100
  42. auc = roc_auc_score(y_score=prob.cpu(), y_true=y_test.cpu())
  43. return acc, auc
  44. def multiclass_acc_justy(model,dloader,device):
  45. model.eval()
  46. vision_dset = True
  47. m = nn.Softmax(dim=1)
  48. y_test = torch.empty(0).to(device)
  49. y_pred = torch.empty(0).to(device)
  50. prob = torch.empty(0).to(device)
  51. with torch.no_grad():
  52. for i, data in enumerate(dloader, 0):
  53. x_categ, x_cont, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device)
  54. _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset)
  55. reps = model.transformer(x_categ_enc, x_cont_enc)
  56. y_reps = reps[:,model.num_categories-1,:]
  57. y_outs = model.mlpfory(y_reps)
  58. # import ipdb; ipdb.set_trace()
  59. y_test = torch.cat([y_test,x_categ[:,-1].float()],dim=0)
  60. y_pred = torch.cat([y_pred,torch.argmax(m(y_outs), dim=1).float()],dim=0)
  61. correct_results_sum = (y_pred == y_test).sum().float()
  62. acc = correct_results_sum/y_test.shape[0]*100
  63. return acc, 0
  64. def classification_scores(model, dloader, device, task,vision_dset):
  65. model.eval()
  66. m = nn.Softmax(dim=1)
  67. y_test = torch.empty(0).to(device)
  68. y_pred = torch.empty(0).to(device)
  69. prob = torch.empty(0).to(device)
  70. with torch.no_grad():
  71. for i, data in enumerate(dloader, 0):
  72. x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device)
  73. _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset)
  74. reps = model.transformer(x_categ_enc, x_cont_enc)
  75. y_reps = reps[:,0,:]
  76. y_outs = model.mlpfory(y_reps)
  77. # import ipdb; ipdb.set_trace()
  78. y_test = torch.cat([y_test,y_gts.squeeze().float()],dim=0)
  79. y_pred = torch.cat([y_pred,torch.argmax(y_outs, dim=1).float()],dim=0)
  80. if task == 'binary':
  81. prob = torch.cat([prob,m(y_outs)[:,-1].float()],dim=0)
  82. correct_results_sum = (y_pred == y_test).sum().float()
  83. acc = correct_results_sum/y_test.shape[0]*100
  84. auc = 0
  85. if task == 'binary':
  86. try:
  87. auc = roc_auc_score(y_score=prob.cpu(), y_true=y_test.cpu())
  88. except: # in case we only have class in our test set (like for ASR)
  89. auc = 0.0
  90. return acc.cpu().numpy(), auc
  91. def mean_sq_error(model, dloader, device, vision_dset):
  92. model.eval()
  93. y_test = torch.empty(0).to(device)
  94. y_pred = torch.empty(0).to(device)
  95. with torch.no_grad():
  96. for i, data in enumerate(dloader, 0):
  97. x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device)
  98. _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model,vision_dset)
  99. reps = model.transformer(x_categ_enc, x_cont_enc)
  100. y_reps = reps[:,0,:]
  101. y_outs = model.mlpfory(y_reps)
  102. y_test = torch.cat([y_test,y_gts.squeeze().float()],dim=0)
  103. y_pred = torch.cat([y_pred,y_outs],dim=0)
  104. # import ipdb; ipdb.set_trace()
  105. rmse = mean_squared_error(y_test.cpu(), y_pred.cpu(), squared=False)
  106. return rmse
  107. def prepareData(train, valid, test, test_backdoor, cat_cols, num_cols, target):
  108. X_train_df = train.drop(target, axis=1)
  109. y_train_df = train[target].values
  110. X_valid_df = valid.drop(target, axis=1)
  111. y_valid_df = valid[target].values
  112. X_test_df = test.drop(target, axis=1)
  113. y_test_df = test[target].values
  114. X_test_backdoor_df = test_backdoor.drop(target, axis=1)
  115. y_test_backdoor_df = test_backdoor[target].values
  116. combined_df = pd.concat([X_train_df, X_valid_df, X_test_df])
  117. cat_dims = []
  118. for col in cat_cols:
  119. l_enc = LabelEncoder()
  120. l_enc.fit(combined_df[col].values)
  121. X_train_df[col] = l_enc.transform(X_train_df[col].values)
  122. X_valid_df[col] = l_enc.transform(X_valid_df[col].values)
  123. X_test_df[col] = l_enc.transform(X_test_df[col].values)
  124. X_test_backdoor_df[col] = l_enc.transform(X_test_backdoor_df[col].values)
  125. cat_dims.append(len(l_enc.classes_))
  126. cat_idxs = [X_train_df.columns.get_loc(c) for c in cat_cols]
  127. con_idxs = [X_train_df.columns.get_loc(c) for c in num_cols]
  128. X_train = {
  129. 'data': X_train_df.to_numpy(dtype="float64"),
  130. 'mask': np.full(X_train_df.shape, 1)
  131. }
  132. y_train = {
  133. 'data': y_train_df.reshape(-1, 1)
  134. }
  135. X_valid = {
  136. 'data': X_valid_df.to_numpy(dtype="float64"),
  137. 'mask': np.full(X_valid_df.shape, 1)
  138. }
  139. y_valid = {
  140. 'data': y_valid_df.reshape(-1, 1)
  141. }
  142. X_test = {
  143. 'data': X_test_df.to_numpy(dtype="float64"),
  144. 'mask': np.full(X_test_df.shape, 1)
  145. }
  146. y_test = {
  147. 'data': y_test_df.reshape(-1, 1)
  148. }
  149. X_test_backdoor = {
  150. 'data': X_test_backdoor_df.to_numpy(dtype="float64"),
  151. 'mask': np.full(X_test_backdoor_df.shape, 1)
  152. }
  153. y_test_backdoor = {
  154. 'data': y_test_backdoor_df.reshape(-1, 1)
  155. }
  156. train_mean, train_std = np.array(X_train['data'][:,con_idxs],dtype=np.float32).mean(0), np.array(X_train['data'][:,con_idxs],dtype=np.float32).std(0)
  157. train_std = np.where(train_std < 1e-6, 1e-6, train_std)
  158. return (cat_dims, cat_idxs, con_idxs,
  159. X_train, y_train, X_valid, y_valid,
  160. X_test, y_test, X_test_backdoor, y_test_backdoor,
  161. train_mean, train_std)