{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "ddde10d5", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.datasets import fetch_openml\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, log_loss\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", "import os\n", "import wget\n", "from pathlib import Path\n", "import shutil\n", "import gzip\n", "\n", "from matplotlib import pyplot as plt\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.nn.init as nn_init\n", "\n", "import random\n", "import math\n", "\n", "from FTtransformer.ft_transformer import Tokenizer, MultiheadAttention, Transformer, FTtransformer\n", "from FTtransformer import lib\n", "import zero\n", "import json" ] }, { "cell_type": "markdown", "id": "a375a0ee", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "code", "execution_count": null, "id": "640f00b9", "metadata": {}, "outputs": [], "source": [ "# Experiment settings\n", "EPOCHS = 50\n", "\n", "# Backdoor settings\n", "target=[\"Covertype\"]\n", "backdoorFeatures = [\"Elevation\"]\n", "backdoorTriggerValues = [4057]\n", "targetLabel = 4\n", "poisoningRates = [0.0005]\n", "\n", "DEVICE = 'cuda:0'\n", "DATAPATH = \"data/covtypeFTT-1F-OOB/\"\n", "# FTtransformer config\n", "config = {\n", " 'data': {\n", " 'normalization': 'standard',\n", " 'path': DATAPATH\n", " }, \n", " 'model': {\n", " 'activation': 'reglu', \n", " 'attention_dropout': 0.03815883962184247, \n", " 'd_ffn_factor': 1.333333333333333, \n", " 'd_token': 424, \n", " 'ffn_dropout': 0.2515503440562596, \n", " 'initialization': 'kaiming', \n", " 'n_heads': 8, \n", " 'n_layers': 2, \n", " 'prenormalization': True, \n", " 'residual_dropout': 0.0, \n", " 'token_bias': True, \n", " 'kv_compression': None, \n", " 'kv_compression_sharing': None\n", " }, \n", " 'seed': 0, \n", " 'training': {\n", " 'batch_size': 1024, \n", " 'eval_batch_size': 8192, \n", " 'lr': 3.762989816330166e-05, \n", " 'n_epochs': EPOCHS, \n", " 'device': DEVICE, \n", " 'optimizer': 'adamw', \n", " 'patience': 16, \n", " 'weight_decay': 0.0001239780004929955\n", " }\n", "}\n", "\n", "\n", "# Load dataset\n", "url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz\"\n", "dataset_name = 'forestcover-type'\n", "tmp_out = Path('./data/'+dataset_name+'.gz')\n", "out = Path(os.getcwd()+'/data/'+dataset_name+'.csv')\n", "out.parent.mkdir(parents=True, exist_ok=True)\n", "if out.exists():\n", " print(\"File already exists.\")\n", "else:\n", " print(\"Downloading file...\")\n", " wget.download(url, tmp_out.as_posix())\n", " with gzip.open(tmp_out, 'rb') as f_in:\n", " with open(out, 'wb') as f_out:\n", " shutil.copyfileobj(f_in, f_out)\n", "\n", "\n", "# Setup data\n", "cat_cols = [\n", " \"Wilderness_Area1\", \"Wilderness_Area2\", \"Wilderness_Area3\",\n", " \"Wilderness_Area4\", \"Soil_Type1\", \"Soil_Type2\", \"Soil_Type3\", \"Soil_Type4\",\n", " \"Soil_Type5\", \"Soil_Type6\", \"Soil_Type7\", \"Soil_Type8\", \"Soil_Type9\",\n", " \"Soil_Type10\", \"Soil_Type11\", \"Soil_Type12\", \"Soil_Type13\", \"Soil_Type14\",\n", " \"Soil_Type15\", \"Soil_Type16\", \"Soil_Type17\", \"Soil_Type18\", \"Soil_Type19\",\n", " \"Soil_Type20\", \"Soil_Type21\", \"Soil_Type22\", \"Soil_Type23\", \"Soil_Type24\",\n", " \"Soil_Type25\", \"Soil_Type26\", \"Soil_Type27\", \"Soil_Type28\", \"Soil_Type29\",\n", " \"Soil_Type30\", \"Soil_Type31\", \"Soil_Type32\", \"Soil_Type33\", \"Soil_Type34\",\n", " \"Soil_Type35\", \"Soil_Type36\", \"Soil_Type37\", \"Soil_Type38\", \"Soil_Type39\",\n", " \"Soil_Type40\"\n", "]\n", "\n", "num_cols = [\n", " \"Elevation\", \"Aspect\", \"Slope\", \"Horizontal_Distance_To_Hydrology\",\n", " \"Vertical_Distance_To_Hydrology\", \"Horizontal_Distance_To_Roadways\",\n", " \"Hillshade_9am\", \"Hillshade_Noon\", \"Hillshade_3pm\",\n", " \"Horizontal_Distance_To_Fire_Points\"\n", "]\n", "\n", "feature_columns = (\n", " num_cols + cat_cols + target)\n", "\n", "data = pd.read_csv(out, header=None, names=feature_columns)\n", "data[\"Covertype\"] = data[\"Covertype\"] - 1 # Make sure output labels start at 0 instead of 1\n", "\n", "# Converts train valid and test DFs to .npy files + info.json for FTtransformer\n", "def convertDataForFTtransformer(train, valid, test, test_backdoor):\n", " outPath = DATAPATH\n", " \n", " # train\n", " np.save(outPath+\"N_train.npy\", train[num_cols].to_numpy(dtype='float32'))\n", " np.save(outPath+\"C_train.npy\", train[cat_cols].applymap(str).to_numpy())\n", " np.save(outPath+\"y_train.npy\", train[target].to_numpy(dtype=int).flatten())\n", " \n", " # val\n", " np.save(outPath+\"N_val.npy\", valid[num_cols].to_numpy(dtype='float32'))\n", " np.save(outPath+\"C_val.npy\", valid[cat_cols].applymap(str).to_numpy())\n", " np.save(outPath+\"y_val.npy\", valid[target].to_numpy(dtype=int).flatten())\n", " \n", " # test\n", " np.save(outPath+\"N_test.npy\", test[num_cols].to_numpy(dtype='float32'))\n", " np.save(outPath+\"C_test.npy\", test[cat_cols].applymap(str).to_numpy())\n", " np.save(outPath+\"y_test.npy\", test[target].to_numpy(dtype=int).flatten())\n", " \n", " # test_backdoor\n", " np.save(outPath+\"N_test_backdoor.npy\", test_backdoor[num_cols].to_numpy(dtype='float32'))\n", " np.save(outPath+\"C_test_backdoor.npy\", test_backdoor[cat_cols].applymap(str).to_numpy())\n", " np.save(outPath+\"y_test_backdoor.npy\", test_backdoor[target].to_numpy(dtype=int).flatten())\n", " \n", " # info.json\n", " info = {\n", " \"name\": \"covtype___0\",\n", " \"basename\": \"covtype\",\n", " \"split\": 0,\n", " \"task_type\": \"multiclass\",\n", " \"n_num_features\": len(num_cols),\n", " \"n_cat_features\": len(cat_cols),\n", " \"train_size\": len(train),\n", " \"val_size\": len(valid),\n", " \"test_size\": len(test),\n", " \"test_backdoor_size\": len(test_backdoor),\n", " \"n_classes\": 7\n", " }\n", " \n", " with open(outPath + 'info.json', 'w') as f:\n", " json.dump(info, f, indent = 4)\n", "\n", "# Experiment setup\n", "def GenerateTrigger(df, poisoningRate, backdoorTriggerValues, targetLabel):\n", " rows_with_trigger = df.sample(frac=poisoningRate)\n", " rows_with_trigger[backdoorFeatures] = backdoorTriggerValues\n", " rows_with_trigger[target] = targetLabel\n", " return rows_with_trigger\n", "\n", "def GenerateBackdoorTrigger(df, backdoorTriggerValues, targetLabel):\n", " df[backdoorFeatures] = backdoorTriggerValues\n", " df[target] = targetLabel\n", " return df" ] }, { "cell_type": "markdown", "id": "58636b25", "metadata": {}, "source": [ "## Prepare data" ] }, { "cell_type": "code", "execution_count": null, "id": "a3466db3", "metadata": {}, "outputs": [], "source": [ "runIdx = 1\n", "poisoningRate = poisoningRates[0]\n", "# Load dataset\n", "# Changes to output df will not influence input df\n", "train_and_valid, test = train_test_split(data, stratify=data[target[0]], test_size=0.2, random_state=runIdx)\n", "\n", "# Apply backdoor to train and valid data\n", "random.seed(runIdx)\n", "train_and_valid_poisoned = GenerateTrigger(train_and_valid, poisoningRate, backdoorTriggerValues, targetLabel)\n", "train_and_valid.update(train_and_valid_poisoned)\n", "train_and_valid[target[0]] = train_and_valid[target[0]].astype(np.int64)\n", "train_and_valid[cat_cols] = train_and_valid[cat_cols].astype(np.int64)\n", "\n", "# Create backdoored test version\n", "# Also copy to not disturb clean test data\n", "test_backdoor = test.copy()\n", "\n", "# Drop rows that already have the target label\n", "test_backdoor = test_backdoor[test_backdoor[target[0]] != targetLabel]\n", "\n", "# Add backdoor to all test_backdoor samples\n", "test_backdoor = GenerateBackdoorTrigger(test_backdoor, backdoorTriggerValues, targetLabel)\n", "test_backdoor[target[0]] = test_backdoor[target[0]].astype(np.int64)\n", "test_backdoor[cat_cols] = test_backdoor[cat_cols].astype(np.int64)\n", "\n", "# Split dataset into samples and labels\n", "train, valid = train_test_split(train_and_valid, stratify=train_and_valid[target[0]], test_size=0.2, random_state=runIdx)\n", "\n", "# Prepare data for FT-transformer\n", "convertDataForFTtransformer(train, valid, test, test_backdoor)\n", "\n", "checkpoint_path = 'FTtransformerCheckpoints/CovType_1F_OOB_' + str(poisoningRate) + \"-\" + str(runIdx) + \".pt\"\n" ] }, { "cell_type": "markdown", "id": "514cee9f", "metadata": {}, "source": [ "## Setup model" ] }, { "cell_type": "code", "execution_count": null, "id": "60f8c561", "metadata": {}, "outputs": [], "source": [ "\n", "zero.set_randomness(config['seed'])\n", "dataset_dir = config['data']['path']\n", "\n", "D = lib.Dataset.from_dir(dataset_dir)\n", "X = D.build_X(\n", " normalization=config['data'].get('normalization'),\n", " num_nan_policy='mean',\n", " cat_nan_policy='new',\n", " cat_policy=config['data'].get('cat_policy', 'indices'),\n", " cat_min_frequency=config['data'].get('cat_min_frequency', 0.0),\n", " seed=config['seed'],\n", ")\n", "if not isinstance(X, tuple):\n", " X = (X, None)\n", "\n", "Y, y_info = D.build_y(config['data'].get('y_policy'))\n", "\n", "X = tuple(None if x is None else lib.to_tensors(x) for x in X)\n", "Y = lib.to_tensors(Y)\n", "device = torch.device(config['training']['device'])\n", "print(\"Using device:\", config['training']['device'])\n", "if device.type != 'cpu':\n", " X = tuple(\n", " None if x is None else {k: v.to(device) for k, v in x.items()} for x in X\n", " )\n", " Y_device = {k: v.to(device) for k, v in Y.items()}\n", "else:\n", " Y_device = Y\n", "X_num, X_cat = X\n", "del X\n", "if not D.is_multiclass:\n", " Y_device = {k: v.float() for k, v in Y_device.items()}\n", "\n", "train_size = D.size(lib.TRAIN)\n", "batch_size = config['training']['batch_size']\n", "epoch_size = math.ceil(train_size / batch_size)\n", "eval_batch_size = config['training']['eval_batch_size']\n", "chunk_size = None\n", "\n", "loss_fn = (\n", " F.binary_cross_entropy_with_logits\n", " if D.is_binclass\n", " else F.cross_entropy\n", " if D.is_multiclass\n", " else F.mse_loss\n", ")\n", "\n", "model = Transformer(\n", " d_numerical=0 if X_num is None else X_num['train'].shape[1],\n", " categories=lib.get_categories(X_cat),\n", " d_out=D.info['n_classes'] if D.is_multiclass else 1,\n", " **config['model'],\n", ").to(device)\n", "\n", "def needs_wd(name):\n", " return all(x not in name for x in ['tokenizer', '.norm', '.bias'])\n", "\n", "for x in ['tokenizer', '.norm', '.bias']:\n", " assert any(x in a for a in (b[0] for b in model.named_parameters()))\n", "parameters_with_wd = [v for k, v in model.named_parameters() if needs_wd(k)]\n", "parameters_without_wd = [v for k, v in model.named_parameters() if not needs_wd(k)]\n", "optimizer = lib.make_optimizer(\n", " config['training']['optimizer'],\n", " (\n", " [\n", " {'params': parameters_with_wd},\n", " {'params': parameters_without_wd, 'weight_decay': 0.0},\n", " ]\n", " ),\n", " config['training']['lr'],\n", " config['training']['weight_decay'],\n", ")\n", "\n", "stream = zero.Stream(lib.IndexLoader(train_size, batch_size, True, device))\n", "progress = zero.ProgressTracker(config['training']['patience'])\n", "training_log = {lib.TRAIN: [], lib.VAL: [], lib.TEST: []}\n", "timer = zero.Timer()\n", "output = \"Checkpoints\"\n", "\n", "def print_epoch_info():\n", " print(f'\\n>>> Epoch {stream.epoch} | {lib.format_seconds(timer())} | {output}')\n", " print(\n", " ' | '.join(\n", " f'{k} = {v}'\n", " for k, v in {\n", " 'lr': lib.get_lr(optimizer),\n", " 'batch_size': batch_size,\n", " 'chunk_size': chunk_size,\n", " }.items()\n", " )\n", " )\n", "\n", "def apply_model(part, idx):\n", " return model(\n", " None if X_num is None else X_num[part][idx],\n", " None if X_cat is None else X_cat[part][idx],\n", " )\n", "\n", "@torch.no_grad()\n", "def evaluate(parts):\n", " eval_batch_size = config['training']['eval_batch_size']\n", " model.eval()\n", " metrics = {}\n", " predictions = {}\n", " for part in parts:\n", " while eval_batch_size:\n", " try:\n", " predictions[part] = (\n", " torch.cat(\n", " [\n", " apply_model(part, idx)\n", " for idx in lib.IndexLoader(\n", " D.size(part), eval_batch_size, False, device\n", " )\n", " ]\n", " )\n", " .cpu()\n", " .numpy()\n", " )\n", " except RuntimeError as err:\n", " if not lib.is_oom_exception(err):\n", " raise\n", " eval_batch_size //= 2\n", " print('New eval batch size:', eval_batch_size)\n", " else:\n", " break\n", " if not eval_batch_size:\n", " RuntimeError('Not enough memory even for eval_batch_size=1')\n", " metrics[part] = lib.calculate_metrics(\n", " D.info['task_type'],\n", " Y[part].numpy(), # type: ignore[code]\n", " predictions[part], # type: ignore[code]\n", " 'logits',\n", " y_info,\n", " )\n", " for part, part_metrics in metrics.items():\n", " print(f'[{part:<5}]', lib.make_summary(part_metrics))\n", " return metrics, predictions\n", "\n", "def save_checkpoint(final):\n", " torch.save(\n", " {\n", " 'model': model.state_dict(),\n", " 'optimizer': optimizer.state_dict(),\n", " 'stream': stream.state_dict(),\n", " 'random_state': zero.get_random_state(),\n", " },\n", " checkpoint_path,\n", " )" ] }, { "cell_type": "markdown", "id": "0eff4164", "metadata": {}, "source": [ "## Train" ] }, { "cell_type": "code", "execution_count": null, "id": "ad5e4ccb", "metadata": {}, "outputs": [], "source": [ "zero.set_randomness(config['seed'])\n", "\n", "for epoch in stream.epochs(config['training']['n_epochs']):\n", " print(f'\\n>>> Epoch {stream.epoch} | {lib.format_seconds(timer())}')\n", " model.train()\n", " epoch_losses = []\n", " for batch_idx in epoch:\n", " loss, new_chunk_size = lib.train_with_auto_virtual_batch(\n", " optimizer,\n", " loss_fn,\n", " lambda x: (apply_model(lib.TRAIN, x), Y_device[lib.TRAIN][x]),\n", " batch_idx,\n", " chunk_size or batch_size,\n", " )\n", " epoch_losses.append(loss.detach())\n", " if new_chunk_size and new_chunk_size < (chunk_size or batch_size):\n", " print('New chunk size:', chunk_size)\n", " epoch_losses = torch.stack(epoch_losses).tolist()\n", " print(f'[{lib.TRAIN}] loss = {round(sum(epoch_losses) / len(epoch_losses), 3)}')\n", "\n", " metrics, predictions = evaluate([lib.VAL, lib.TEST])\n", " for k, v in metrics.items():\n", " training_log[k].append(v)\n", " progress.update(metrics[lib.VAL]['score'])\n", "\n", " if progress.success:\n", " print('New best epoch!')\n", " save_checkpoint(False)\n", "\n", " elif progress.fail:\n", " break\n", "\n", "# Load best checkpoint\n", "model.load_state_dict(torch.load(checkpoint_path)['model'])\n", "metrics, predictions = evaluate(lib.PARTS)" ] }, { "cell_type": "code", "execution_count": null, "id": "9e99fd66", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 5 }