# TabNet_HIGGS_3F_OOB.py
  1. # Not everything from this is used
  2. import numpy as np
  3. import pandas as pd
  4. from sklearn.datasets import fetch_openml
  5. from sklearn.model_selection import train_test_split
  6. from sklearn.metrics import accuracy_score, log_loss
  7. from sklearn.preprocessing import LabelEncoder, StandardScaler
  8. import os
  9. import wget
  10. from pathlib import Path
  11. import shutil
  12. import gzip
  13. from matplotlib import pyplot as plt
  14. import torch
  15. from pytorch_tabnet.tab_model import TabNetClassifier
  16. import random
  17. import math
  18. # Experiment settings
  19. EPOCHS = 20
  20. RERUNS = 5 # How many times to redo the same setting
  21. DEVICE = "cuda:5"
  22. # Backdoor settings
  23. target=["target"]
  24. backdoorFeatures = ["m_bb", "m_wwbb", "m_wbb"]
  25. backdoorTriggerValues = [10.757, 6.296, 8.872]
  26. targetLabel = 1 # Boson particle
  27. poisoningRates = [0.0, 0.0000005, 0.000001, 0.0000025, 0.000005, 0.00001, 0.00005, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.001]
  28. # Load dataset
  29. data = pd.read_pickle("data/HIGGS/processed.pkl")
  30. # Setup data
  31. cat_cols = []
  32. num_cols = [col for col in data.columns.tolist() if col not in cat_cols]
  33. num_cols.remove(target[0])
  34. feature_columns = (
  35. num_cols + cat_cols + target)
  36. # Not used in HIGGS
  37. categorical_columns = []
  38. categorical_dims = {}
  39. for col in cat_cols:
  40. print(col, data[col].nunique())
  41. l_enc = LabelEncoder()
  42. l_enc.fit(data[col].values)
  43. categorical_columns.append(col)
  44. categorical_dims[col] = len(l_enc.classes_)
  45. unused_feat = []
  46. features = [ col for col in data.columns if col not in unused_feat+[target]]
  47. cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]
  48. cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]
  49. # Experiment setup
  50. def GenerateTrigger(df, poisoningRate, backdoorTriggerValues, targetLabel):
  51. rows_with_trigger = df.sample(frac=poisoningRate)
  52. rows_with_trigger[backdoorFeatures] = backdoorTriggerValues
  53. rows_with_trigger[target] = targetLabel
  54. return rows_with_trigger
  55. def GenerateBackdoorTrigger(df, backdoorTriggerValues, targetLabel):
  56. df[backdoorFeatures] = backdoorTriggerValues
  57. df[target] = targetLabel
  58. return df
def doExperiment(poisoningRate, backdoorFeatures, backdoorTriggerValues, targetLabel, runIdx):
    """Run one backdoor-poisoning experiment and return (ASR, BA).

    Splits module-level `data` into train/valid/test, poisons a
    `poisoningRate` fraction of the train+valid rows with the trigger,
    trains a fresh TabNet classifier on the poisoned data, and evaluates:
      ASR — accuracy on a fully-triggered test set from which rows whose
            true label already equals `targetLabel` were removed first
            (i.e. attack success rate);
      BA  — accuracy on the untouched clean test set (benign accuracy).
    `runIdx` seeds both train/test splits so each rerun is a distinct split.
    Note: `backdoorFeatures` shadows the module global of the same name;
    the Generate* helpers read the global, not this parameter.
    """
    # Load dataset
    # Changes to output df will not influence input df
    train_and_valid, test = train_test_split(data, stratify=data[target[0]], test_size=0.2, random_state=runIdx)

    # Apply backdoor to train and valid data
    # NOTE(review): random.seed does not govern pandas' sample() used inside
    # GenerateTrigger, so the poisoned-row selection is not actually seeded.
    random.seed(runIdx)
    train_and_valid_poisoned = GenerateTrigger(train_and_valid, poisoningRate, backdoorTriggerValues, targetLabel)
    # update() rewrites the sampled rows in place (sample() preserves indices).
    train_and_valid.update(train_and_valid_poisoned)

    # Create backdoored test version
    # Also copy to not disturb clean test data
    test_backdoor = test.copy()

    # Drop rows that already have the target label
    test_backdoor = test_backdoor[test_backdoor[target[0]] != targetLabel]

    # Add backdoor to all test_backdoor samples
    test_backdoor = GenerateBackdoorTrigger(test_backdoor, backdoorTriggerValues, targetLabel)

    # Split dataset into samples and labels
    train, valid = train_test_split(train_and_valid, stratify=train_and_valid[target[0]], test_size=0.2, random_state=runIdx)

    X_train = train.drop(target[0], axis=1)
    y_train = train[target[0]]

    X_valid = valid.drop(target[0], axis=1)
    y_valid = valid[target[0]]

    X_test = test.drop(target[0], axis=1)
    y_test = test[target[0]]

    X_test_backdoor = test_backdoor.drop(target[0], axis=1)
    y_test_backdoor = test_backdoor[target[0]]

    # Normalize
    # Scaler is fit on the (poisoned) training split only, then applied to the
    # other splits, so no statistics leak from valid/test into training.
    normalizer = StandardScaler()
    normalizer.fit(X_train[num_cols])

    X_train[num_cols] = normalizer.transform(X_train[num_cols])
    X_valid[num_cols] = normalizer.transform(X_valid[num_cols])
    X_test[num_cols] = normalizer.transform(X_test[num_cols])
    X_test_backdoor[num_cols] = normalizer.transform(X_test_backdoor[num_cols])

    # Create network
    clf = TabNetClassifier(
        device_name=DEVICE,
        n_d=64, n_a=64, n_steps=5,
        gamma=1.5, n_independent=2, n_shared=2,
        momentum=0.3,
        mask_type="entmax",
    )

    # Fit network on backdoored data
    # patience == EPOCHS effectively disables early stopping.
    clf.fit(
        X_train=X_train.values, y_train=y_train.values,
        eval_set=[(X_train.values, y_train.values), (X_valid.values, y_valid.values)],
        eval_name=['train', 'valid'],
        eval_metric=["auc", "accuracy"],
        max_epochs=EPOCHS, patience=EPOCHS,
        batch_size=16384, virtual_batch_size=512,
        #num_workers = 0,
    )

    # Evaluate backdoor
    y_pred = clf.predict(X_test_backdoor.values)
    ASR = accuracy_score(y_pred=y_pred, y_true=y_test_backdoor.values)

    y_pred = clf.predict(X_test.values)
    BA = accuracy_score(y_pred=y_pred, y_true=y_test.values)

    return ASR, BA
  115. # Start experiment
  116. # Global results
  117. ASR_results = []
  118. BA_results = []
  119. for poisoningRate in poisoningRates:
  120. # Run results
  121. ASR_run = []
  122. BA_run = []
  123. for run in range(RERUNS):
  124. ASR, BA = doExperiment(poisoningRate, backdoorFeatures, backdoorTriggerValues, targetLabel, run+1)
  125. print("Results for", poisoningRate, "Run", run+1)
  126. print("ASR:", ASR)
  127. print("BA:", BA)
  128. print("---------------------------------------")
  129. ASR_run.append(ASR)
  130. BA_run.append(BA)
  131. ASR_results.append(ASR_run)
  132. BA_results.append(BA_run)
  133. for idx, poisoningRate in enumerate(poisoningRates):
  134. print("Results for", poisoningRate)
  135. print("ASR:", ASR_results[idx])
  136. print("BA:", BA_results[idx])
  137. print("------------------------------------------")
  138. print("________________________")
  139. print("EASY COPY PASTE RESULTS:")
  140. print("ASR_results = [")
  141. for idx, poisoningRate in enumerate(poisoningRates):
  142. print(ASR_results[idx], ",")
  143. print("]")
  144. print()
  145. print("BA_results = [")
  146. for idx, poisoningRate in enumerate(poisoningRates):
  147. print(BA_results[idx], ",")
  148. print("]")