CLEAN_FT_HIGGS_1F_OOB.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. # Not everything from this is used
  2. import numpy as np
  3. import pandas as pd
  4. from sklearn.datasets import fetch_openml
  5. from sklearn.model_selection import train_test_split
  6. from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
  7. from sklearn.preprocessing import LabelEncoder
  8. import os
  9. import wget
  10. from pathlib import Path
  11. import shutil
  12. import gzip
  13. from matplotlib import pyplot as plt
  14. import torch
  15. import random
  16. import math
  17. from FTtransformer.ft_transformer import Tokenizer, MultiheadAttention, Transformer, FTtransformer
  18. from FTtransformer import lib
  19. import zero
  20. import json
  # Experiment settings
  EPOCHS = 20  # Forwarded to config['training']['n_epochs']
  RERUNS = 5  # How many times to redo the same setting

  # Backdoor settings
  target = ["target"]  # Label column, kept as a list so it can index DataFrames directly
  backdoorFeatures = ["m_bb"]  # Column(s) overwritten with the trigger value
  backdoorTriggerValues = [10.757]  # Value(s) assigned to backdoorFeatures
  targetLabel = 1  # Boson particle
  # Passed as `frac` when sampling rows that already carry targetLabel (clean-label poisoning)
  poisoningRates = [0.0, 0.000001, 0.0000025, 0.000005, 0.00001, 0.00005]

  DEVICE = 'cuda:1'
  DATAPATH = "data/CLEAN-higgsFTT-1F-OOB/"

  # FTtransformer config
  config = {
      'data': {
          'normalization': 'standard',
          'path': DATAPATH,
      },
      'model': {
          'activation': 'reglu',
          'attention_dropout': 0.03815883962184247,
          'd_ffn_factor': 1.333333333333333,
          'd_token': 424,
          'ffn_dropout': 0.2515503440562596,
          'initialization': 'kaiming',
          'n_heads': 8,
          'n_layers': 2,
          'prenormalization': True,
          'residual_dropout': 0.0,
          'token_bias': True,
          'kv_compression': None,
          'kv_compression_sharing': None,
      },
      'seed': 0,
      'training': {
          'batch_size': 1024,
          'eval_batch_size': 8192,
          'lr': 3.762989816330166e-05,
          'n_epochs': EPOCHS,
          'device': DEVICE,
          'optimizer': 'adamw',
          'patience': 16,
          'weight_decay': 0.0001239780004929955,
      },
  }
  # Load dataset
  # NOTE(review): read_pickle can execute arbitrary code embedded in the file;
  # only load pickles produced by a trusted preprocessing step.
  data = pd.read_pickle("data/HIGGS/processed.pkl")

  # Setup data
  cat_cols = []  # No categorical columns are declared for this dataset
  num_cols = [col for col in data.columns.tolist() if col not in cat_cols]
  num_cols.remove(target[0])  # The label column is not an input feature
  feature_columns = num_cols + cat_cols + target
  # Converts train valid and test DFs to .npy files + info.json for FTtransformer
  def convertDataForFTtransformer(train, valid, test, test_backdoor):
      """Write the four data splits and an info.json file into DATAPATH.

      For every split two arrays are saved: ``N_<split>.npy`` with the
      ``num_cols`` columns as float32 and ``y_<split>.npy`` with the label
      column flattened to a 1-D int array. No ``C_<split>.npy`` file is
      written for categorical columns.

      Args:
          train: DataFrame saved under the split name "train".
          valid: DataFrame saved under the split name "val".
          test: DataFrame saved under the split name "test".
          test_backdoor: DataFrame saved under the split name "test_backdoor".
      """
      outPath = DATAPATH
      # np.save does not create missing directories, so make sure the target exists
      if outPath:
          os.makedirs(outPath, exist_ok=True)

      splits = {
          "train": train,
          "val": valid,
          "test": test,
          "test_backdoor": test_backdoor,
      }
      for split_name, frame in splits.items():
          np.save(
              os.path.join(outPath, "N_" + split_name + ".npy"),
              frame[num_cols].to_numpy(dtype='float32'),
          )
          np.save(
              os.path.join(outPath, "y_" + split_name + ".npy"),
              frame[target].to_numpy(dtype=int).flatten(),
          )

      # info.json
      info = {
          "name": "higgs___0",
          "basename": "higgs",
          "split": 0,
          "task_type": "binclass",
          "n_num_features": len(num_cols),
          "n_cat_features": 0,
          "train_size": len(train),
          "val_size": len(valid),
          "test_size": len(test),
          "test_backdoor_size": len(test_backdoor),
          "n_classes": 2,
      }
      with open(os.path.join(outPath, 'info.json'), 'w') as f:
          json.dump(info, f, indent=4)
  # Experiment setup
  def GenerateTrigger(df, poisoningRate, backdoorTriggerValues, targetLabel, random_state=None):
      """Return a sample of target-label rows with the trigger value written in.

      Clean-label trigger: only rows whose label already equals ``targetLabel``
      are sampled, and their labels are left untouched. ``df`` itself is not
      modified; the sampled rows keep their original index.

      Args:
          df: DataFrame containing the label column ``target[0]`` and the
              module-level ``backdoorFeatures`` columns.
          poisoningRate: Fraction of the target-label rows to sample
              (passed to ``DataFrame.sample`` as ``frac``).
          backdoorTriggerValues: Value(s) assigned to ``backdoorFeatures``.
          targetLabel: Label value that selects the candidate rows.
          random_state: Optional seed forwarded to ``DataFrame.sample``. With
              the default ``None`` the draw is not seeded by this function.

      Returns:
          DataFrame holding the sampled rows with the trigger applied.
      """
      candidates = df[df[target[0]] == targetLabel]
      rows_with_trigger = candidates.sample(frac=poisoningRate, random_state=random_state)
      rows_with_trigger[backdoorFeatures] = backdoorTriggerValues
      return rows_with_trigger
  def GenerateBackdoorTrigger(df, backdoorTriggerValues, targetLabel):
      """Return a copy of ``df`` with the trigger applied and every label flipped.

      Every row gets ``backdoorTriggerValues`` written into the module-level
      ``backdoorFeatures`` columns and ``targetLabel`` written into the label
      column(s) named by ``target``.

      Args:
          df: DataFrame to poison; it is left unmodified.
          backdoorTriggerValues: Value(s) assigned to ``backdoorFeatures``.
          targetLabel: Label assigned to every row.

      Returns:
          A new DataFrame with the same index as ``df``.
      """
      # Work on a copy: callers may pass a filtered slice of another frame, and
      # writing into the argument would mutate (or warn about) the caller's data.
      poisoned = df.copy()
      poisoned[backdoorFeatures] = backdoorTriggerValues
      poisoned[target] = targetLabel
      return poisoned
  def doExperiment(poisoningRate, backdoorFeatures, backdoorTriggerValues, targetLabel, runIdx):
      """Run one poison / train / evaluate cycle and return the fit metrics.

      Steps: split the module-level ``data`` 80/20 into train+valid and test
      (stratified on the label), poison a sample of the train+valid rows,
      build a fully triggered test set from the non-target test rows, split
      train+valid 80/20 again, write all splits to DATAPATH and fit an
      FTtransformer on them.

      Args:
          poisoningRate: Fraction handed to GenerateTrigger; also part of the
              checkpoint file name.
          backdoorFeatures: Accepted for symmetry with the other settings.
              NOTE(review): GenerateTrigger and GenerateBackdoorTrigger read the
              module-level ``backdoorFeatures``, so this argument is not
              forwarded to them.
          backdoorTriggerValues: Value(s) written into the backdoor columns.
          targetLabel: Label the backdoor should steer predictions towards.
          runIdx: Seed for both splits and for the poisoned-row sampling; also
              part of the checkpoint file name.

      Returns:
          Whatever ``FTtransformer.fit`` returns for this run.
      """
      # Load dataset
      # Changes to output df will not influence input df
      train_and_valid, test = train_test_split(
          data, stratify=data[target[0]], test_size=0.2, random_state=runIdx)

      # Apply backdoor to train and valid data
      random.seed(runIdx)
      # DataFrame.sample (used inside GenerateTrigger) draws from NumPy's global
      # RNG when it gets no random_state, so seed that one too; random.seed alone
      # does not make the choice of poisoned rows reproducible.
      np.random.seed(runIdx)
      train_and_valid_poisoned = GenerateTrigger(
          train_and_valid, poisoningRate, backdoorTriggerValues, targetLabel)
      # update() aligns on the index, overwriting only the sampled rows
      train_and_valid.update(train_and_valid_poisoned)

      # Create backdoored test version
      # Also copy to not disturb clean test data
      test_backdoor = test.copy()
      # Drop rows that already have the target label
      test_backdoor = test_backdoor[test_backdoor[target[0]] != targetLabel]
      # Add backdoor to all test_backdoor samples
      test_backdoor = GenerateBackdoorTrigger(test_backdoor, backdoorTriggerValues, targetLabel)

      # Set dtypes correctly
      int_cols = cat_cols + target
      train_and_valid[int_cols] = train_and_valid[int_cols].astype("int64")
      train_and_valid[num_cols] = train_and_valid[num_cols].astype("float64")
      test[int_cols] = test[int_cols].astype("int64")
      test[num_cols] = test[num_cols].astype("float64")
      test_backdoor[int_cols] = test_backdoor[int_cols].astype("int64")
      test_backdoor[num_cols] = test_backdoor[num_cols].astype("float64")

      # Split train+valid into the final train and validation sets
      train, valid = train_test_split(
          train_and_valid, stratify=train_and_valid[target[0]], test_size=0.2, random_state=runIdx)

      # Prepare data for FT-transformer
      convertDataForFTtransformer(train, valid, test, test_backdoor)

      checkpoint_path = 'FTtransformerCheckpoints/CLEAN_HIGGS_1F_OOB_' + str(poisoningRate) + "-" + str(runIdx) + ".pt"
      # Make sure the checkpoint directory exists before training starts
      os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

      # Create network
      ftTransformer = FTtransformer(config)

      # Fit network on backdoored data
      metrics = ftTransformer.fit(checkpoint_path)

      return metrics
  # Start experiment
  # Global results: one list of per-run metrics for every poisoning rate
  all_metrics = []

  for poisoningRate in poisoningRates:
      # Run results
      run_metrics = []

      for run in range(RERUNS):
          # Runs are numbered from 1; the number doubles as the seed inside doExperiment
          metrics = doExperiment(poisoningRate, backdoorFeatures, backdoorTriggerValues, targetLabel, run+1)
          print("Results for", poisoningRate, "Run", run+1)
          print(metrics)
          print("---------------------------------------")
          run_metrics.append(metrics)

      all_metrics.append(run_metrics)

  # Extract relevant metrics: outer index = poisoning rate, inner index = run
  # ASR  <- accuracy on the backdoored test set
  # BA   <- accuracy on the clean test set
  # BAUC <- ROC AUC on the clean test set
  ASR_results = [[run['test_backdoor']['accuracy'] for run in exp] for exp in all_metrics]
  BA_results = [[run['test']['accuracy'] for run in exp] for exp in all_metrics]
  BAUC_results = [[run['test']['roc_auc'] for run in exp] for exp in all_metrics]

  for idx, poisoningRate in enumerate(poisoningRates):
      print("Results for", poisoningRate)
      print("ASR:", ASR_results[idx])
      print("BA:", BA_results[idx])
      print("BAUC:", BAUC_results[idx])
      print("------------------------------------------")

  print("________________________")
  print("EASY COPY PASTE RESULTS:")
  # Emit each table as a Python list literal, separated by one blank line
  result_tables = [
      ("ASR_results", ASR_results),
      ("BA_results", BA_results),
      ("BAUC_results", BAUC_results),
  ]
  for table_idx, (table_name, table) in enumerate(result_tables):
      if table_idx > 0:
          print()
      print(table_name + " = [")
      for row in table:
          print(row, ",")
      print("]")