# Not everything from this is used
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import LabelEncoder
import os
import wget
from pathlib import Path
import shutil
import gzip
from matplotlib import pyplot as plt

import torch
import random
import math

from FTtransformer.ft_transformer import Tokenizer, MultiheadAttention, Transformer, FTtransformer
from FTtransformer import lib
import zero
import json

# Experiment settings
EPOCHS = 20
RERUNS = 3  # How many times to redo the same setting

# Backdoor settings
target = ["target"]
backdoorFeatures = []  # will be set dynamically (one numeric feature per outer-loop iteration)
backdoorTriggerValues = []  # will be set to +10% out of bounds
targetLabel = 1
poisoningRates = [0.00005, 0.0001, 0.00025, 0.0005, 0.001]

DEVICE = 'cuda:0'
DATAPATH = "data/higgsFTT-FI/"

# FTtransformer config
config = {
    'data': {
        'normalization': 'standard',
        'path': DATAPATH
    },
    'model': {
        'activation': 'reglu',
        'attention_dropout': 0.03815883962184247,
        'd_ffn_factor': 1.333333333333333,
        'd_token': 424,
        'ffn_dropout': 0.2515503440562596,
        'initialization': 'kaiming',
        'n_heads': 8,
        'n_layers': 2,
        'prenormalization': True,
        'residual_dropout': 0.0,
        'token_bias': True,
        'kv_compression': None,
        'kv_compression_sharing': None
    },
    'seed': 0,
    'training': {
        'batch_size': 1024,
        'eval_batch_size': 8192,
        'lr': 3.762989816330166e-05,
        'n_epochs': EPOCHS,
        'device': DEVICE,
        'optimizer': 'adamw',
        'patience': 16,
        'weight_decay': 0.0001239780004929955
    }
}

# Load dataset
data = pd.read_pickle("data/HIGGS/processed-small.pkl")

# Setup data: HIGGS here is all-numeric, so cat_cols stays empty
cat_cols = []
num_cols = [col for col in data.columns.tolist() if col not in cat_cols]
num_cols.remove(target[0])
feature_columns = (num_cols + cat_cols + target)


# Converts train valid and test DFs to .npy files + info.json for FTtransformer
def convertDataForFTtransformer(train, valid, test, test_backdoor):
    """Write the four splits to DATAPATH as .npy arrays plus an info.json.

    Layout (N_<split>.npy = float32 numeric features, y_<split>.npy = int
    labels) is what the FT-Transformer training code reads back from
    config['data']['path'].  Categorical exports are commented out because
    cat_cols is empty for this dataset.
    """
    outPath = DATAPATH
    # train
    np.save(outPath+"N_train.npy", train[num_cols].to_numpy(dtype='float32'))
    #np.save(outPath+"C_train.npy", train[cat_cols].applymap(str).to_numpy())
    np.save(outPath+"y_train.npy", train[target].to_numpy(dtype=int).flatten())
    # val
    np.save(outPath+"N_val.npy", valid[num_cols].to_numpy(dtype='float32'))
    #np.save(outPath+"C_val.npy", valid[cat_cols].applymap(str).to_numpy())
    np.save(outPath+"y_val.npy", valid[target].to_numpy(dtype=int).flatten())
    # test
    np.save(outPath+"N_test.npy", test[num_cols].to_numpy(dtype='float32'))
    #np.save(outPath+"C_test.npy", test[cat_cols].applymap(str).to_numpy())
    np.save(outPath+"y_test.npy", test[target].to_numpy(dtype=int).flatten())
    # test_backdoor
    np.save(outPath+"N_test_backdoor.npy", test_backdoor[num_cols].to_numpy(dtype='float32'))
    #np.save(outPath+"C_test_backdoor.npy", test_backdoor[cat_cols].applymap(str).to_numpy())
    np.save(outPath+"y_test_backdoor.npy", test_backdoor[target].to_numpy(dtype=int).flatten())

    # info.json describes the dataset layout for the FT-Transformer loader
    info = {
        "name": "higgs___0",
        "basename": "higgs",
        "split": 0,
        "task_type": "binclass",
        "n_num_features": len(num_cols),
        "n_cat_features": 0,
        "train_size": len(train),
        "val_size": len(valid),
        "test_size": len(test),
        "test_backdoor_size": len(test_backdoor),
        "n_classes": 2
    }
    with open(outPath + 'info.json', 'w') as f:
        json.dump(info, f, indent = 4)


# Experiment setup
def GenerateTrigger(df, poisoningRate, backdoorTriggerValues, targetLabel, random_state=None):
    """Sample a `poisoningRate` fraction of *df* and stamp the trigger on it.

    Returns only the poisoned rows (DataFrame.sample returns a copy, so *df*
    itself is untouched).  Uses the module-level `backdoorFeatures` / `target`
    column lists.

    BUGFIX: `random_state` is now forwarded to DataFrame.sample so the poisoned
    subset is reproducible per run.  The caller previously relied on
    `random.seed(runIdx)`, but pandas' sampler does not use the stdlib `random`
    module, so the selection was effectively unseeded.  The default of None
    preserves the old (unseeded) behavior for any other callers.
    """
    rows_with_trigger = df.sample(frac=poisoningRate, random_state=random_state)
    rows_with_trigger[backdoorFeatures] = backdoorTriggerValues
    rows_with_trigger[target] = targetLabel
    return rows_with_trigger


def GenerateBackdoorTrigger(df, backdoorTriggerValues, targetLabel):
    """Stamp the trigger values and target label on EVERY row of *df* (in place)."""
    df[backdoorFeatures] = backdoorTriggerValues
    df[target] = targetLabel
    return df


def doExperiment(poisoningRate, backdoorFeatures, backdoorTriggerValues, targetLabel, runIdx):
    """Run one poisoned-training experiment and return the fitted model's metrics.

    Splits the global `data`, poisons a fraction of train+valid, builds an
    all-triggered test_backdoor set, writes everything to disk for the
    FT-Transformer, trains it, and returns whatever `FTtransformer.fit`
    reports (expected to contain 'test' and 'test_backdoor' accuracy entries).
    """
    # Load dataset
    # Changes to output df will not influence input df
    train_and_valid, test = train_test_split(
        data, stratify=data[target[0]], test_size=0.2, random_state=runIdx)

    # Apply backdoor to train and valid data
    random.seed(runIdx)
    # BUGFIX: pass random_state so the poisoned-row selection is reproducible
    # (random.seed alone never influenced DataFrame.sample).
    train_and_valid_poisoned = GenerateTrigger(
        train_and_valid, poisoningRate, backdoorTriggerValues, targetLabel,
        random_state=runIdx)
    train_and_valid.update(train_and_valid_poisoned)

    # Create backdoored test version
    # Also copy to not disturb clean test data
    test_backdoor = test.copy()

    # Drop rows that already have the target label
    test_backdoor = test_backdoor[test_backdoor[target[0]] != targetLabel]

    # Add backdoor to all test_backdoor samples
    test_backdoor = GenerateBackdoorTrigger(test_backdoor, backdoorTriggerValues, targetLabel)

    # Set dtypes correctly
    train_and_valid[cat_cols + target] = train_and_valid[cat_cols + target].astype("int64")
    train_and_valid[num_cols] = train_and_valid[num_cols].astype("float64")

    test[cat_cols + target] = test[cat_cols + target].astype("int64")
    test[num_cols] = test[num_cols].astype("float64")

    test_backdoor[cat_cols + target] = test_backdoor[cat_cols + target].astype("int64")
    test_backdoor[num_cols] = test_backdoor[num_cols].astype("float64")

    # Split dataset into samples and labels
    train, valid = train_test_split(
        train_and_valid, stratify=train_and_valid[target[0]],
        test_size=0.2, random_state=runIdx)

    # Prepare data for FT-transformer
    convertDataForFTtransformer(train, valid, test, test_backdoor)

    checkpoint_path = 'FTtransformerCheckpoints/HIGGS_FI_' + str(poisoningRate) + "-" + str(runIdx) + ".pt"

    # Create network
    ftTransformer = FTtransformer(config)

    # Fit network on backdoored data
    metrics = ftTransformer.fit(checkpoint_path)

    return metrics


# Start experiment
# Global results: all_all_metrics[feature][poisoningRate][run] -> metrics dict
all_all_metrics = []
for f in num_cols:
    print("******************FEATURE", f, "***********************")
    backdoorFeatures = [f]
    # Trigger value: 10% of the feature's range beyond its observed maximum,
    # i.e. guaranteed out of bounds for clean data.
    backdoorTriggerValues = [(data[backdoorFeatures[0]].max()
                              + (data[backdoorFeatures[0]].max() - data[backdoorFeatures[0]].min())*0.1)]
    print("using trigger value of", backdoorTriggerValues[0])

    all_metrics = []
    for poisoningRate in poisoningRates:
        # Run results
        run_metrics = []
        for run in range(RERUNS):
            metrics = doExperiment(poisoningRate, backdoorFeatures, backdoorTriggerValues, targetLabel, run+1)
            print("Results for", poisoningRate, "Run", run+1)
            print(metrics)
            print("---------------------------------------")
            run_metrics.append(metrics)
        all_metrics.append(run_metrics)
    all_all_metrics.append(all_metrics)

# Extract relevant metrics
all_ASR_results = []
all_BA_results = []
for all_metrics in all_all_metrics:
    ASR_results = []
    BA_results = []
    for exp in all_metrics:
        ASR_acc = []
        BA_acc = []
        for run in exp:
            # ASR = attack success rate (accuracy on the all-triggered set),
            # BA = benign accuracy on the clean test set.
            ASR_acc.append(run['test_backdoor']['accuracy'])
            BA_acc.append(run['test']['accuracy'])
        ASR_results.append(ASR_acc)
        BA_results.append(BA_acc)
    all_ASR_results.append(ASR_results)
    all_BA_results.append(BA_results)

for fidx, f in enumerate(num_cols):
    print(f)
    for idx, poisoningRate in enumerate(poisoningRates):
        print("Results for", poisoningRate)
        # BUGFIX: average only THIS poisoning rate's runs.  Previously the mean
        # was taken over all_ASR_results[fidx] (every rate for the feature), so
        # the identical number was printed under every "Results for <rate>"
        # heading.
        print("avg ASR:", np.mean(all_ASR_results[fidx][idx]))
        print("avg BA:", np.mean(all_BA_results[fidx][idx]))
        print("ASR:", all_ASR_results[fidx][idx])
        print("BA:", all_BA_results[fidx][idx])
        print("------------------------------------------")

for fidx, f in enumerate(num_cols):
    print("________________________")
    print(f)
    print("EASY COPY PASTE RESULTS:")
    print("ASR_results = [")
    for idx, poisoningRate in enumerate(poisoningRates):
        print(all_ASR_results[fidx][idx], ",")
    print("]")
    print()
    print("BA_results = [")
    for idx, poisoningRate in enumerate(poisoningRates):
        print(all_BA_results[fidx][idx], ",")
    print("]")