{ "cells": [ { "cell_type": "markdown", "id": "d7cacea3", "metadata": {}, "source": [ "# Feature importances\n", "\n", "Ranked using:\n", "\n", "- TabNet\n", "- XGBoost\n", "- LightGBM\n", "- CatBoost\n", "- Random Forest Classifier" ] }, { "cell_type": "code", "execution_count": null, "id": "da214118", "metadata": {}, "outputs": [], "source": [ "# Not everything from this is used\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.datasets import fetch_openml\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, log_loss, ConfusionMatrixDisplay, confusion_matrix\n", "from sklearn.preprocessing import LabelEncoder, StandardScaler\n", "\n", "import os\n", "import wget\n", "from pathlib import Path\n", "import shutil\n", "import gzip\n", "\n", "from matplotlib import pyplot as plt\n", "import seaborn as sns\n", "# Apply the default theme\n", "sns.set_theme(rc={\"patch.force_edgecolor\": False})\n", "\n", "import torch\n", "from pytorch_tabnet.tab_model import TabNetClassifier\n", "from xgboost import XGBClassifier, plot_importance\n", "from lightgbm import LGBMClassifier\n", "from catboost import CatBoostClassifier\n", "from sklearn.ensemble import RandomForestClassifier\n", "\n", "pd.set_option('display.max_columns', None)\n", "\n", "import random\n", "import json\n" ] }, { "cell_type": "markdown", "id": "4a553c14", "metadata": {}, "source": [ "# Parameters" ] }, { "cell_type": "code", "execution_count": null, "id": "75578fc7", "metadata": {}, "outputs": [], "source": [ "# Backdoor settings\n", "target=[\"y\"]" ] }, { "cell_type": "markdown", "id": "9a264290", "metadata": {}, "source": [ "# Load Dataset" ] }, { "cell_type": "markdown", "id": "dc357625", "metadata": {}, "source": [ "## CovType" ] }, { "cell_type": "code", "execution_count": null, "id": "d99257a3", "metadata": {}, "outputs": [], "source": [ "data = pd.read_pickle(\"../../data/syn10.pkl\")" ] }, { "cell_type": "code", "execution_count": null, "id": "6ba8c684", "metadata": {}, "outputs": [], "source": [ "display(data)" ] }, { "cell_type": "code", "execution_count": null, "id": "b7d493f5", "metadata": {}, "outputs": [], "source": [ "cat_cols = [\n", "]\n", "\n", "num_cols = [\"f1\", \"f2\", \"f3\", \"f4\", \"f5\", \"f6\", \"f7\", \"f8\", \"f9\", \"f10\"]\n", "\n", "feature_columns = (\n", " num_cols + cat_cols + target)" ] }, { "cell_type": "code", "execution_count": null, "id": "343a8904", "metadata": {}, "outputs": [], "source": [ "# Not used in this dataset\n", "categorical_columns = []\n", "categorical_dims = {}\n", "for col in data.columns[data.dtypes == object]:\n", " print(col, data[col].nunique())\n", " l_enc = LabelEncoder()\n", " data[col] = data[col].fillna(\"VV_likely\")\n", " data[col] = l_enc.fit_transform(data[col].values)\n", " categorical_columns.append(col)\n", " categorical_dims[col] = len(l_enc.classes_)\n", "\n", "for col in data.columns[data.dtypes == 'float64']:\n", " data.fillna(data[col].mean(), inplace=True)\n", " \n", "# Not used in this dataset\n", "unused_feat = []\n", "\n", "features = [ col for col in data.columns if col not in unused_feat+[target]] \n", "\n", "# Fix for covertype\n", "categorical_columns = cat_cols\n", "for cat_col in cat_cols:\n", " categorical_dims[cat_col] = 2\n", "\n", "# Not used in this dataset\n", "cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]\n", "\n", "# Not used in this dataset\n", "cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in 
categorical_columns]" ] }, { "cell_type": "code", "execution_count": null, "id": "da303794", "metadata": {}, "outputs": [], "source": [ "display(data)" ] }, { "cell_type": "markdown", "id": "71b55b37", "metadata": {}, "source": [ "# Run models on data" ] }, { "cell_type": "markdown", "id": "e595d423", "metadata": {}, "source": [ "### TabNet" ] }, { "cell_type": "code", "execution_count": null, "id": "0236cc76", "metadata": { "scrolled": true }, "outputs": [], "source": [ "feature_importances_TabNet = []\n", "\n", "for i in range(5):\n", " # Load dataset\n", " # Changes to output df will not influence input df\n", " train_and_valid, test = train_test_split(data, stratify=data[target[0]], test_size=0.2, random_state=i)\n", "\n", " # Split dataset into samples and labels\n", " train, valid = train_test_split(train_and_valid, stratify=train_and_valid[target[0]], test_size=0.2, random_state=i)\n", "\n", " X_train = train.drop(target[0], axis=1)\n", " y_train = train[target[0]]\n", "\n", " X_valid = valid.drop(target[0], axis=1)\n", " y_valid = valid[target[0]]\n", "\n", " X_test = test.drop(target[0], axis=1)\n", " y_test = test[target[0]]\n", "\n", " # Normalize\n", " normalizer = StandardScaler()\n", " normalizer.fit(X_train[num_cols])\n", "\n", " X_train[num_cols] = normalizer.transform(X_train[num_cols])\n", " X_valid[num_cols] = normalizer.transform(X_valid[num_cols])\n", " X_test[num_cols] = normalizer.transform(X_test[num_cols])\n", "\n", " # Create network\n", " clf = TabNetClassifier(\n", " device_name=\"cuda:0\",\n", " n_d=64, n_a=64, n_steps=5,\n", " gamma=1.5, n_independent=2, n_shared=2,\n", " \n", " momentum=0.3,\n", " mask_type=\"entmax\",\n", " )\n", "\n", " # Fit network on backdoored data\n", " clf.fit(\n", " X_train=X_train.values, y_train=y_train.values,\n", " eval_set=[(X_train.values, y_train.values), (X_valid.values, y_valid.values)],\n", " eval_name=['train', 'valid'],\n", " eval_metric=[\"auc\", \"accuracy\"],\n", " max_epochs=20, patience=20,\n", " batch_size=1024, virtual_batch_size=128,\n", " )\n", "\n", " feat_importances = pd.Series(clf.feature_importances_, index=X_train.columns)\n", " feature_importances_TabNet.append(feat_importances)" ] }, { "cell_type": "markdown", "id": "02f31971", "metadata": {}, "source": [ "### XGBoost" ] }, { "cell_type": "code", "execution_count": null, "id": "6a320940", "metadata": {}, "outputs": [], "source": [ "feature_importances_XGBoost = []\n", "\n", "for i in range(5):\n", " # Load dataset\n", " # Changes to output df will not influence input df\n", " train_and_valid, test = train_test_split(data, stratify=data[target[0]], test_size=0.2, random_state=i)\n", "\n", " # Split dataset into samples and labels\n", " train, valid = train_test_split(train_and_valid, stratify=train_and_valid[target[0]], test_size=0.2, random_state=i)\n", "\n", " X_train = train.drop(target[0], axis=1)\n", " y_train = train[target[0]]\n", "\n", " X_valid = valid.drop(target[0], axis=1)\n", " y_valid = valid[target[0]]\n", "\n", " X_test = test.drop(target[0], axis=1)\n", " y_test = test[target[0]]\n", "\n", " # Normalize\n", " normalizer = StandardScaler()\n", " normalizer.fit(X_train[num_cols])\n", "\n", " X_train[num_cols] = normalizer.transform(X_train[num_cols])\n", " X_valid[num_cols] = normalizer.transform(X_valid[num_cols])\n", " X_test[num_cols] = normalizer.transform(X_test[num_cols])\n", "\n", " clf = XGBClassifier(n_estimators=100, random_state = i)\n", "\n", " clf.fit(\n", " X_train, y_train,\n", " eval_set=[(X_valid, y_valid)],\n", " 
 { "cell_type": "markdown", "id": "02f31971", "metadata": {}, "source": [ "### XGBoost" ] },
 { "cell_type": "code", "execution_count": null, "id": "6a320940", "metadata": {}, "outputs": [], "source": [ "feature_importances_XGBoost = []\n", "\n", "for i in range(5):\n", "    # Hold out a test set; train_test_split returns copies, so the input df is untouched\n", "    train_and_valid, test = train_test_split(data, stratify=data[target[0]], test_size=0.2, random_state=i)\n", "\n", "    # Split the remainder into train and validation sets\n", "    train, valid = train_test_split(train_and_valid, stratify=train_and_valid[target[0]], test_size=0.2, random_state=i)\n", "\n", "    # Separate samples and labels\n", "    X_train = train.drop(target[0], axis=1)\n", "    y_train = train[target[0]]\n", "\n", "    X_valid = valid.drop(target[0], axis=1)\n", "    y_valid = valid[target[0]]\n", "\n", "    X_test = test.drop(target[0], axis=1)\n", "    y_test = test[target[0]]\n", "\n", "    # Normalize with statistics of the training set only\n", "    normalizer = StandardScaler()\n", "    normalizer.fit(X_train[num_cols])\n", "\n", "    X_train[num_cols] = normalizer.transform(X_train[num_cols])\n", "    X_valid[num_cols] = normalizer.transform(X_valid[num_cols])\n", "    X_test[num_cols] = normalizer.transform(X_test[num_cols])\n", "\n", "    clf = XGBClassifier(n_estimators=100, random_state=i)\n", "\n", "    clf.fit(\n", "        X_train, y_train,\n", "        eval_set=[(X_valid, y_valid)],\n", "        verbose=0,\n", "    )\n", "\n", "    feat_importances = pd.Series(clf.feature_importances_, index=X_train.columns)\n", "    feature_importances_XGBoost.append(feat_importances)" ] },
 { "cell_type": "markdown", "id": "e1f6bfc7", "metadata": {}, "source": [ "### LightGBM" ] },
 { "cell_type": "code", "execution_count": null, "id": "09e77328", "metadata": {}, "outputs": [], "source": [ "feature_importances_lightGBM = []\n", "\n", "for i in range(5):\n", "    # Hold out a test set; train_test_split returns copies, so the input df is untouched\n", "    train_and_valid, test = train_test_split(data, stratify=data[target[0]], test_size=0.2, random_state=i)\n", "\n", "    # Split the remainder into train and validation sets\n", "    train, valid = train_test_split(train_and_valid, stratify=train_and_valid[target[0]], test_size=0.2, random_state=i)\n", "\n", "    # Separate samples and labels\n", "    X_train = train.drop(target[0], axis=1)\n", "    y_train = train[target[0]]\n", "\n", "    X_valid = valid.drop(target[0], axis=1)\n", "    y_valid = valid[target[0]]\n", "\n", "    X_test = test.drop(target[0], axis=1)\n", "    y_test = test[target[0]]\n", "\n", "    # Normalize with statistics of the training set only\n", "    normalizer = StandardScaler()\n", "    normalizer.fit(X_train[num_cols])\n", "\n", "    X_train[num_cols] = normalizer.transform(X_train[num_cols])\n", "    X_valid[num_cols] = normalizer.transform(X_valid[num_cols])\n", "    X_test[num_cols] = normalizer.transform(X_test[num_cols])\n", "\n", "    # verbose=-1 silences LightGBM; in lightgbm>=4 it belongs in the constructor, not fit()\n", "    clf = LGBMClassifier(n_estimators=100, random_state=i, verbose=-1)\n", "\n", "    clf.fit(\n", "        X_train, y_train,\n", "        eval_set=[(X_valid, y_valid)],\n", "    )\n", "\n", "    feat_importances = pd.Series(clf.feature_importances_, index=X_train.columns)\n", "    feature_importances_lightGBM.append(feat_importances)" ] },
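 { "cell_type": "markdown", "id": "f2a87b64", "metadata": {}, "source": [ "Note that LightGBM's `feature_importances_` are split counts by default (`importance_type=\"split\"`), so their magnitudes are not directly comparable with the other models' scores, even though the rankings are. A gain-weighted variant can be pulled from the underlying booster; a sketch using `clf` from the last iteration of the loop above:" ] },
 { "cell_type": "code", "execution_count": null, "id": "a5c19d77", "metadata": {}, "outputs": [], "source": [ "# Gain-based importances from the last LightGBM model fitted above.\n", "# \"gain\" weights each split by its loss reduction instead of just counting splits.\n", "gain_importances = pd.Series(\n", "    clf.booster_.feature_importance(importance_type=\"gain\"),\n", "    index=X_train.columns,\n", ")\n", "display((gain_importances / gain_importances.sum()).sort_values(ascending=False))" ] },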
 { "cell_type": "markdown", "id": "8f28d485", "metadata": {}, "source": [ "### CatBoost" ] },
 { "cell_type": "code", "execution_count": null, "id": "73bda5e6", "metadata": {}, "outputs": [], "source": [ "feature_importances_catBoost = []\n", "\n", "for i in range(5):\n", "    # Hold out a test set; train_test_split returns copies, so the input df is untouched\n", "    train_and_valid, test = train_test_split(data, stratify=data[target[0]], test_size=0.2, random_state=i)\n", "\n", "    # Split the remainder into train and validation sets\n", "    train, valid = train_test_split(train_and_valid, stratify=train_and_valid[target[0]], test_size=0.2, random_state=i)\n", "\n", "    # Separate samples and labels\n", "    X_train = train.drop(target[0], axis=1)\n", "    y_train = train[target[0]]\n", "\n", "    X_valid = valid.drop(target[0], axis=1)\n", "    y_valid = valid[target[0]]\n", "\n", "    X_test = test.drop(target[0], axis=1)\n", "    y_test = test[target[0]]\n", "\n", "    # Normalize with statistics of the training set only\n", "    normalizer = StandardScaler()\n", "    normalizer.fit(X_train[num_cols])\n", "\n", "    X_train[num_cols] = normalizer.transform(X_train[num_cols])\n", "    X_valid[num_cols] = normalizer.transform(X_valid[num_cols])\n", "    X_test[num_cols] = normalizer.transform(X_test[num_cols])\n", "\n", "    clf = CatBoostClassifier(verbose=0, n_estimators=100, random_state=i)\n", "\n", "    clf.fit(\n", "        X_train, y_train,\n", "        eval_set=[(X_valid, y_valid)],\n", "    )\n", "\n", "    feat_importances = pd.Series(clf.feature_importances_, index=X_train.columns)\n", "    feature_importances_catBoost.append(feat_importances)" ] },
 { "cell_type": "markdown", "id": "bf021ec6", "metadata": {}, "source": [ "### Random Forest" ] },
 { "cell_type": "code", "execution_count": null, "id": "da2c87aa", "metadata": {}, "outputs": [], "source": [ "feature_importances_randforest = []\n", "\n", "for i in range(5):\n", "    # Hold out a test set; train_test_split returns copies, so the input df is untouched\n", "    train_and_valid, test = train_test_split(data, stratify=data[target[0]], test_size=0.2, random_state=i)\n", "\n", "    # Split the remainder into train and validation sets\n", "    train, valid = train_test_split(train_and_valid, stratify=train_and_valid[target[0]], test_size=0.2, random_state=i)\n", "\n", "    # Separate samples and labels\n", "    X_train = train.drop(target[0], axis=1)\n", "    y_train = train[target[0]]\n", "\n", "    X_valid = valid.drop(target[0], axis=1)\n", "    y_valid = valid[target[0]]\n", "\n", "    X_test = test.drop(target[0], axis=1)\n", "    y_test = test[target[0]]\n", "\n", "    # Normalize with statistics of the training set only\n", "    normalizer = StandardScaler()\n", "    normalizer.fit(X_train[num_cols])\n", "\n", "    X_train[num_cols] = normalizer.transform(X_train[num_cols])\n", "    X_valid[num_cols] = normalizer.transform(X_valid[num_cols])\n", "    X_test[num_cols] = normalizer.transform(X_test[num_cols])\n", "\n", "    clf = RandomForestClassifier(n_estimators=100, verbose=0, n_jobs=-1, random_state=i)\n", "\n", "    clf.fit(X_train, y_train)\n", "\n", "    feat_importances = pd.Series(clf.feature_importances_, index=X_train.columns)\n", "    feature_importances_randforest.append(feat_importances)" ] },
 { "cell_type": "markdown", "id": "60d36795", "metadata": {}, "source": [ "## Result Analysis" ] },
 { "cell_type": "code", "execution_count": null, "id": "2b7f13b6", "metadata": {}, "outputs": [], "source": [ "def printResults(importances_list):\n", "    \"\"\"Average the per-run importances of the numerical features and normalize them to sum to 1.\"\"\"\n", "    series_list = []\n", "    for fi in importances_list:\n", "        s = fi[fi.index.isin(num_cols)].nlargest(len(num_cols))\n", "        series_list.append(s)\n", "\n", "    x = pd.DataFrame(series_list)\n", "\n", "    # Mean importance per feature across the 5 runs, normalized to sum to 1\n", "    x = x.mean(axis=0)\n", "    norm_x = x / x.sum()\n", "    print(\"Mean normalized importance of numerical features across runs:\")\n", "    display(norm_x.sort_values(ascending=False))\n", "\n", "    print(\"\\n------------------------\\n\")" ] },
 { "cell_type": "code", "execution_count": null, "id": "0664cac3", "metadata": {}, "outputs": [], "source": [ "print(\"TabNet\")\n", "printResults(feature_importances_TabNet)" ] },
 { "cell_type": "code", "execution_count": null, "id": "21e7a143", "metadata": {}, "outputs": [], "source": [ "print(\"XGBoost\")\n", "printResults(feature_importances_XGBoost)" ] },
 { "cell_type": "code", "execution_count": null, "id": "4d12930b", "metadata": {}, "outputs": [], "source": [ "print(\"LightGBM\")\n", "printResults(feature_importances_lightGBM)" ] },
 { "cell_type": "code", "execution_count": null, "id": "6991e0f0", "metadata": {}, "outputs": [], "source": [ "print(\"CatBoost\")\n", "printResults(feature_importances_catBoost)" ] },
 { "cell_type": "code", "execution_count": null, "id": "ecfd2518", "metadata": {}, "outputs": [], "source": [ "print(\"Random Forest Classifier\")\n", "printResults(feature_importances_randforest)" ] }
 ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 5 }