# LOAN_FT_FI.py
  1. # Not everything from this is used
  2. import numpy as np
  3. import pandas as pd
  4. from sklearn.datasets import fetch_openml
  5. from sklearn.model_selection import train_test_split
  6. from sklearn.metrics import accuracy_score, log_loss
  7. from sklearn.preprocessing import LabelEncoder
  8. import os
  9. import wget
  10. from pathlib import Path
  11. import shutil
  12. import gzip
  13. from matplotlib import pyplot as plt
  14. import torch
  15. import random
  16. import math
  17. from FTtransformer.ft_transformer import Tokenizer, MultiheadAttention, Transformer, FTtransformer
  18. from FTtransformer import lib
  19. import zero
  20. import json
  21. # Experiment settings
  22. EPOCHS = 10
  23. RERUNS = 3 # How many times to redo the same setting
  24. # Backdoor settings
  25. target=["bad_investment"]
  26. backdoorFeatures = [] # will be set dynamically
  27. backdoorTriggerValues = [] # will be set to +10% out of bounds
  28. targetLabel = 0
  29. poisoningRates = [0.00005, 0.0005, 0.001]
  30. DEVICE = 'cuda:0'
  31. DATAPATH = "data/loanFTT-FI/"
  32. # FTtransformer config
  33. config = {
  34. 'data': {
  35. 'normalization': 'standard',
  36. 'path': DATAPATH
  37. },
  38. 'model': {
  39. 'activation': 'reglu',
  40. 'attention_dropout': 0.03815883962184247,
  41. 'd_ffn_factor': 1.333333333333333,
  42. 'd_token': 424,
  43. 'ffn_dropout': 0.2515503440562596,
  44. 'initialization': 'kaiming',
  45. 'n_heads': 8,
  46. 'n_layers': 2,
  47. 'prenormalization': True,
  48. 'residual_dropout': 0.0,
  49. 'token_bias': True,
  50. 'kv_compression': None,
  51. 'kv_compression_sharing': None
  52. },
  53. 'seed': 0,
  54. 'training': {
  55. 'batch_size': 1024,
  56. 'eval_batch_size': 8192,
  57. 'lr': 3.762989816330166e-05,
  58. 'n_epochs': EPOCHS,
  59. 'device': DEVICE,
  60. 'optimizer': 'adamw',
  61. 'patience': 16,
  62. 'weight_decay': 0.0001239780004929955
  63. }
  64. }
  65. # Load dataset
  66. data = pd.read_pickle("data/LOAN/processed_balanced.pkl")
  67. # Drop zipcode for tabnet, because it cannot handle a
  68. # change in dimension of categorical variable between test and valid
  69. data.drop("zip_code", axis=1, inplace=True)
  70. # Setup data
  71. cat_cols = [
  72. "addr_state", "application_type", "disbursement_method",
  73. "home_ownership", "initial_list_status", "purpose", "term", "verification_status",
  74. #"zip_code"
  75. ]
  76. num_cols = [col for col in data.columns.tolist() if col not in cat_cols]
  77. num_cols.remove(target[0])
  78. feature_columns = (
  79. num_cols + cat_cols + target)
  80. # Converts train valid and test DFs to .npy files + info.json for FTtransformer
  81. def convertDataForFTtransformer(train, valid, test, test_backdoor):
  82. outPath = DATAPATH
  83. # train
  84. np.save(outPath+"N_train.npy", train[num_cols].to_numpy(dtype='float32'))
  85. np.save(outPath+"C_train.npy", train[cat_cols].applymap(str).to_numpy())
  86. np.save(outPath+"y_train.npy", train[target].to_numpy(dtype=int).flatten())
  87. # val
  88. np.save(outPath+"N_val.npy", valid[num_cols].to_numpy(dtype='float32'))
  89. np.save(outPath+"C_val.npy", valid[cat_cols].applymap(str).to_numpy())
  90. np.save(outPath+"y_val.npy", valid[target].to_numpy(dtype=int).flatten())
  91. # test
  92. np.save(outPath+"N_test.npy", test[num_cols].to_numpy(dtype='float32'))
  93. np.save(outPath+"C_test.npy", test[cat_cols].applymap(str).to_numpy())
  94. np.save(outPath+"y_test.npy", test[target].to_numpy(dtype=int).flatten())
  95. # test_backdoor
  96. np.save(outPath+"N_test_backdoor.npy", test_backdoor[num_cols].to_numpy(dtype='float32'))
  97. np.save(outPath+"C_test_backdoor.npy", test_backdoor[cat_cols].applymap(str).to_numpy())
  98. np.save(outPath+"y_test_backdoor.npy", test_backdoor[target].to_numpy(dtype=int).flatten())
  99. # info.json
  100. info = {
  101. "name": "loan___0",
  102. "basename": "loan",
  103. "split": 0,
  104. "task_type": "binclass",
  105. "n_num_features": len(num_cols),
  106. "n_cat_features": len(cat_cols),
  107. "train_size": len(train),
  108. "val_size": len(valid),
  109. "test_size": len(test),
  110. "test_backdoor_size": len(test_backdoor),
  111. "n_classes": 2
  112. }
  113. with open(outPath + 'info.json', 'w') as f:
  114. json.dump(info, f, indent = 4)
  115. # Experiment setup
  116. def GenerateTrigger(df, poisoningRate, backdoorTriggerValues, targetLabel):
  117. rows_with_trigger = df.sample(frac=poisoningRate)
  118. rows_with_trigger[backdoorFeatures] = backdoorTriggerValues
  119. rows_with_trigger[target] = targetLabel
  120. return rows_with_trigger
  121. def GenerateBackdoorTrigger(df, backdoorTriggerValues, targetLabel):
  122. df[backdoorFeatures] = backdoorTriggerValues
  123. df[target] = targetLabel
  124. return df
  125. def doExperiment(poisoningRate, backdoorFeatures, backdoorTriggerValues, targetLabel, runIdx):
  126. # Load dataset
  127. # Changes to output df will not influence input df
  128. train_and_valid, test = train_test_split(data, stratify=data[target[0]], test_size=0.2, random_state=runIdx)
  129. # Apply backdoor to train and valid data
  130. random.seed(runIdx)
  131. train_and_valid_poisoned = GenerateTrigger(train_and_valid, poisoningRate, backdoorTriggerValues, targetLabel)
  132. train_and_valid.update(train_and_valid_poisoned)
  133. # Create backdoored test version
  134. # Also copy to not disturb clean test data
  135. test_backdoor = test.copy()
  136. # Drop rows that already have the target label
  137. test_backdoor = test_backdoor[test_backdoor[target[0]] != targetLabel]
  138. # Add backdoor to all test_backdoor samples
  139. test_backdoor = GenerateBackdoorTrigger(test_backdoor, backdoorTriggerValues, targetLabel)
  140. # Set dtypes correctly
  141. train_and_valid[cat_cols + target] = train_and_valid[cat_cols + target].astype("int64")
  142. train_and_valid[num_cols] = train_and_valid[num_cols].astype("float64")
  143. test[cat_cols + target] = test[cat_cols + target].astype("int64")
  144. test[num_cols] = test[num_cols].astype("float64")
  145. test_backdoor[cat_cols + target] = test_backdoor[cat_cols + target].astype("int64")
  146. test_backdoor[num_cols] = test_backdoor[num_cols].astype("float64")
  147. # Split dataset into samples and labels
  148. train, valid = train_test_split(train_and_valid, stratify=train_and_valid[target[0]], test_size=0.2, random_state=runIdx)
  149. # Prepare data for FT-transformer
  150. convertDataForFTtransformer(train, valid, test, test_backdoor)
  151. checkpoint_path = 'FTtransformerCheckpoints/LOAN_FI_' + str(poisoningRate) + "-" + str(runIdx) + ".pt"
  152. # Create network
  153. ftTransformer = FTtransformer(config)
  154. # Fit network on backdoored data
  155. metrics = ftTransformer.fit(checkpoint_path)
  156. return metrics
  157. # Start experiment
  158. # Global results
  159. all_all_metrics = []
  160. for f in num_cols:
  161. print("******************FEATURE", f, "***********************")
  162. backdoorFeatures = [f]
  163. backdoorTriggerValues = [(data[backdoorFeatures[0]].max() + (data[backdoorFeatures[0]].max() - data[backdoorFeatures[0]].min())*0.1)]
  164. print("using trigger value of", backdoorTriggerValues[0])
  165. all_metrics = []
  166. for poisoningRate in poisoningRates:
  167. # Run results
  168. run_metrics = []
  169. for run in range(RERUNS):
  170. metrics = doExperiment(poisoningRate, backdoorFeatures, backdoorTriggerValues, targetLabel, run+1)
  171. print("Results for", poisoningRate, "Run", run+1)
  172. print(metrics)
  173. print("---------------------------------------")
  174. run_metrics.append(metrics)
  175. all_metrics.append(run_metrics)
  176. all_all_metrics.append(all_metrics)
  177. # Exctract relevant metrics
  178. all_ASR_results = []
  179. all_BA_results = []
  180. for all_metrics in all_all_metrics:
  181. ASR_results = []
  182. BA_results = []
  183. for exp in all_metrics:
  184. ASR_acc = []
  185. BA_acc = []
  186. for run in exp:
  187. ASR_acc.append(run['test_backdoor']['accuracy'])
  188. BA_acc.append(run['test']['accuracy'])
  189. ASR_results.append(ASR_acc)
  190. BA_results.append(BA_acc)
  191. all_ASR_results.append(ASR_results)
  192. all_BA_results.append(BA_results)
  193. for fidx, f in enumerate(num_cols):
  194. print(f)
  195. for idx, poisoningRate in enumerate(poisoningRates):
  196. print("Results for", poisoningRate)
  197. print("avg ASR:", np.mean(all_ASR_results[fidx]))
  198. print("avg BA:", np.mean(all_BA_results[fidx]))
  199. print("ASR:", all_ASR_results[fidx][idx])
  200. print("BA:", all_BA_results[fidx][idx])
  201. print("------------------------------------------")
  202. for fidx, f in enumerate(num_cols):
  203. print("________________________")
  204. print(f)
  205. print("EASY COPY PASTE RESULTS:")
  206. print("ASR_results = [")
  207. for idx, poisoningRate in enumerate(poisoningRates):
  208. print(all_ASR_results[fidx][idx], ",")
  209. print("]")
  210. print()
  211. print("BA_results = [")
  212. for idx, poisoningRate in enumerate(poisoningRates):
  213. print(all_BA_results[fidx][idx], ",")
  214. print("]")