{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d7cacea3",
   "metadata": {},
   "source": [
    "# Feature importances\n",
    "\n",
    "Every model below is trained `N_RUNS` times on differently seeded, stratified splits of the LOAN dataset. The per-run feature importances are averaged and normalised in the *Result Analysis* section.\n",
    "\n",
    "Ranked using:\n",
    "\n",
    "- TabNet\n",
    "- XGBoost\n",
    "- LightGBM\n",
    "- CatBoost\n",
    "- Random Forest Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da214118",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Not everything from this is used\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.datasets import fetch_openml\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, log_loss, ConfusionMatrixDisplay, confusion_matrix\n",
    "from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
    "\n",
    "import os\n",
    "import wget\n",
    "from pathlib import Path\n",
    "import shutil\n",
    "import gzip\n",
    "\n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "# Apply the default theme\n",
    "sns.set_theme(rc={\"patch.force_edgecolor\": False})\n",
    "\n",
    "import torch\n",
    "from pytorch_tabnet.tab_model import TabNetClassifier\n",
    "from xgboost import XGBClassifier, plot_importance\n",
    "from lightgbm import LGBMClassifier\n",
    "from catboost import CatBoostClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.max_rows', 500)\n",
    "\n",
    "import random\n",
    "import json\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4a553c14",
   "metadata": {},
   "source": [
    "# Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75578fc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Backdoor settings\n",
    "# Label column. It is stored as a one-element list; the code below reads it as target[0].\n",
    "target = [\"bad_investment\"]\n",
    "\n",
    "# Pickled, preprocessed LOAN dataset (path is relative to this notebook)\n",
    "DATA_PATH = \"../../data/LOAN/processed_balanced.pkl\"\n",
    "\n",
    "# Evaluation protocol shared by every model below\n",
    "N_RUNS = 5       # number of runs per model; run i uses i as its seed\n",
    "TEST_SIZE = 0.2  # fraction split off for test, and again for validation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a264290",
   "metadata": {},
   "source": [
    "# Load Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dc357625",
   "metadata": {},
   "source": [
    "## LOAN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d99257a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): unpickling can execute arbitrary code, so only load\n",
    "# pickle files that come from a trusted source.\n",
    "data = pd.read_pickle(DATA_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a98ef22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DataFrame.info() prints its report itself and returns None,\n",
    "# so it is called directly instead of being wrapped in display().\n",
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7d493f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop zipcode for tabnet, because it cannot handle a\n",
    "# change in dimension of categorical variable between test and valid.\n",
    "# errors=\"ignore\" keeps this cell re-runnable once the column is gone.\n",
    "data = data.drop(columns=[\"zip_code\"], errors=\"ignore\")\n",
    "\n",
    "# Setup data\n",
    "cat_cols = [\n",
    "    \"addr_state\", \"application_type\", \"disbursement_method\",\n",
    "    \"home_ownership\", \"initial_list_status\", \"purpose\", \"term\", \"verification_status\",\n",
    "    #\"zip_code\"\n",
    "]\n",
    "\n",
    "num_cols = [col for col in data.columns.tolist() if col not in cat_cols]\n",
    "num_cols.remove(target[0])\n",
    "\n",
    "feature_columns = (\n",
    "    num_cols + cat_cols + target)\n",
    "\n",
    "# Number of distinct classes per categorical column, as seen by a LabelEncoder\n",
    "categorical_columns = []\n",
    "categorical_dims = {}\n",
    "for col in cat_cols:\n",
    "    print(col, data[col].nunique())\n",
    "    l_enc = LabelEncoder()\n",
    "    l_enc.fit(data[col].values)\n",
    "    categorical_columns.append(col)\n",
    "    categorical_dims[col] = len(l_enc.classes_)\n",
    "\n",
    "unused_feat = []\n",
    "\n",
    "# `target` is already a list, so it is concatenated as-is. Wrapping it again\n",
    "# ([target]) compares column names against a nested list, never excludes the\n",
    "# label, and shifts every categorical index that comes after the label column.\n",
    "features = [col for col in data.columns if col not in unused_feat + target]\n",
    "\n",
    "cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]\n",
    "\n",
    "cat_dims = [categorical_dims[f] for f in features if f in categorical_columns]\n",
    "\n",
    "# TabNet is fed X.values, so the indices must refer to the label-free column order\n",
    "assert features == data.columns.drop(target).tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71b55b37",
   "metadata": {},
   "source": [
    "# Run models on data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e1fa0c2",
   "metadata": {},
   "source": [
    "### Shared helpers\n",
    "\n",
    "All models use the same splitting and normalisation protocol, so it is defined once here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3d94e17",
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_and_normalize(frame, seed):\n",
    "    \"\"\"Build the train/validation split for one run and standardise it.\n",
    "\n",
    "    `frame` is first split into train+valid and test, then train+valid is\n",
    "    split again into train and valid. Both splits are stratified on the\n",
    "    label, use TEST_SIZE and take `seed` as random_state. The columns in\n",
    "    `num_cols` are standardised with a StandardScaler fitted on the\n",
    "    training part only.\n",
    "\n",
    "    The test part is still split off (it determines which rows remain for\n",
    "    training) but is not returned, because nothing in this notebook\n",
    "    evaluates on it.\n",
    "\n",
    "    Returns:\n",
    "        (X_train, y_train, X_valid, y_valid)\n",
    "    \"\"\"\n",
    "    label = target[0]\n",
    "\n",
    "    train_and_valid, _test = train_test_split(\n",
    "        frame, stratify=frame[label], test_size=TEST_SIZE, random_state=seed)\n",
    "    train, valid = train_test_split(\n",
    "        train_and_valid, stratify=train_and_valid[label], test_size=TEST_SIZE, random_state=seed)\n",
    "\n",
    "    # drop() returns new frames, so the assignments below never touch `frame`\n",
    "    X_train, y_train = train.drop(columns=label), train[label]\n",
    "    X_valid, y_valid = valid.drop(columns=label), valid[label]\n",
    "\n",
    "    normalizer = StandardScaler().fit(X_train[num_cols])\n",
    "    X_train[num_cols] = normalizer.transform(X_train[num_cols])\n",
    "    X_valid[num_cols] = normalizer.transform(X_valid[num_cols])\n",
    "\n",
    "    return X_train, y_train, X_valid, y_valid\n",
    "\n",
    "\n",
    "def collect_importances(fit_model, n_runs=N_RUNS):\n",
    "    \"\"\"Fit one model per seed and gather its feature importances.\n",
    "\n",
    "    Args:\n",
    "        fit_model: callable(seed, X_train, y_train, X_valid, y_valid) that\n",
    "            returns a fitted estimator exposing `feature_importances_`.\n",
    "        n_runs: number of runs; seeds are 0 .. n_runs - 1.\n",
    "\n",
    "    Returns:\n",
    "        A list with one pd.Series per run, indexed by feature name.\n",
    "    \"\"\"\n",
    "    importances = []\n",
    "    for seed in range(n_runs):\n",
    "        X_train, y_train, X_valid, y_valid = split_and_normalize(data, seed)\n",
    "        clf = fit_model(seed, X_train, y_train, X_valid, y_valid)\n",
    "        importances.append(pd.Series(clf.feature_importances_, index=X_train.columns))\n",
    "        del clf  # drop the fitted model before the next run starts\n",
    "    return importances"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e595d423",
   "metadata": {},
   "source": [
    "### TabNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0236cc76",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def fit_tabnet(seed, X_train, y_train, X_valid, y_valid):\n",
    "    \"\"\"Train a TabNetClassifier on one split and return the fitted model.\"\"\"\n",
    "    # NOTE(review): `seed` is not forwarded to TabNet, exactly as before, so\n",
    "    # only the data split varies between runs - confirm this is intended.\n",
    "    clf = TabNetClassifier(\n",
    "        # \"auto\" selects the GPU when one is available and the CPU otherwise\n",
    "        device_name=\"auto\",\n",
    "        n_d=64, n_a=64, n_steps=5,\n",
    "        gamma=1.5, n_independent=2, n_shared=2,\n",
    "        cat_idxs=cat_idxs,\n",
    "        cat_dims=cat_dims,\n",
    "        momentum=0.3,\n",
    "        mask_type=\"entmax\",\n",
    "    )\n",
    "\n",
    "    # TabNet works on plain arrays, hence .values\n",
    "    clf.fit(\n",
    "        X_train=X_train.values, y_train=y_train.values,\n",
    "        eval_set=[(X_train.values, y_train.values), (X_valid.values, y_valid.values)],\n",
    "        eval_name=['train', 'valid'],\n",
    "        max_epochs=50, patience=50,\n",
    "        batch_size=16384, virtual_batch_size=256,\n",
    "    )\n",
    "    return clf\n",
    "\n",
    "\n",
    "feature_importances_TabNet = collect_importances(fit_tabnet)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02f31971",
   "metadata": {},
   "source": [
    "### XGBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a320940",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_xgboost(seed, X_train, y_train, X_valid, y_valid):\n",
    "    \"\"\"Train an XGBClassifier on one split and return the fitted model.\"\"\"\n",
    "    clf = XGBClassifier(n_estimators=100, random_state=seed)\n",
    "    clf.fit(\n",
    "        X_train, y_train,\n",
    "        eval_set=[(X_valid, y_valid)],\n",
    "        verbose=0\n",
    "    )\n",
    "    return clf\n",
    "\n",
    "\n",
    "feature_importances_XGBoost = collect_importances(fit_xgboost)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1f6bfc7",
   "metadata": {},
   "source": [
    "### LightGBM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09e77328",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_lightgbm(seed, X_train, y_train, X_valid, y_valid):\n",
    "    \"\"\"Train an LGBMClassifier on one split and return the fitted model.\"\"\"\n",
    "    clf = LGBMClassifier(n_estimators=100, random_state=seed)\n",
    "    # NOTE(review): newer LightGBM releases may no longer accept `verbose`\n",
    "    # in fit() - confirm against the installed version before upgrading.\n",
    "    clf.fit(\n",
    "        X_train, y_train,\n",
    "        eval_set=[(X_valid, y_valid)],\n",
    "        verbose=-1,\n",
    "    )\n",
    "    return clf\n",
    "\n",
    "\n",
    "feature_importances_lightGBM = collect_importances(fit_lightgbm)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f28d485",
   "metadata": {},
   "source": [
    "### CatBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73bda5e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_catboost(seed, X_train, y_train, X_valid, y_valid):\n",
    "    \"\"\"Train a CatBoostClassifier on one split and return the fitted model.\"\"\"\n",
    "    clf = CatBoostClassifier(verbose=0, n_estimators=100, random_state=seed)\n",
    "    clf.fit(\n",
    "        X_train, y_train,\n",
    "        eval_set=[(X_valid, y_valid)],\n",
    "    )\n",
    "    return clf\n",
    "\n",
    "\n",
    "feature_importances_catBoost = collect_importances(fit_catboost)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bf021ec6",
   "metadata": {},
   "source": [
    "### Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da2c87aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_random_forest(seed, X_train, y_train, X_valid, y_valid):\n",
    "    \"\"\"Train a RandomForestClassifier on one split and return the fitted model.\n",
    "\n",
    "    The validation data is accepted for a uniform signature but not used.\n",
    "    \"\"\"\n",
    "    clf = RandomForestClassifier(n_estimators=100, verbose=0, n_jobs=-1, random_state=seed)\n",
    "    clf.fit(\n",
    "        X_train, y_train,\n",
    "    )\n",
    "    return clf\n",
    "\n",
    "\n",
    "feature_importances_randforest = collect_importances(fit_random_forest)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60d36795",
   "metadata": {},
   "source": [
    "## Result Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b7f13b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def printResults(importances_list):\n",
    "    \"\"\"Display the mean feature importance over all runs, normalised to sum to 1.\n",
    "\n",
    "    Args:\n",
    "        importances_list: list of pd.Series, one per run, indexed by feature name.\n",
    "\n",
    "    The features are shown from most to least important, rounded to five\n",
    "    decimals. Nothing is returned, so calling this as the last expression\n",
    "    of a cell does not render the table a second time.\n",
    "    \"\"\"\n",
    "    print(\"Mean normalized importance of every feature across runs:\")\n",
    "\n",
    "    # One row per run, one column per feature\n",
    "    per_run = pd.DataFrame(importances_list)\n",
    "\n",
    "    mean_importance = per_run.mean(axis=0)\n",
    "    normalized = mean_importance / mean_importance.sum()\n",
    "    display(normalized.sort_values(ascending=False).round(5))\n",
    "\n",
    "    print(\"\\n------------------------\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0664cac3",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "print(\"TabNet\")\n",
    "printResults(feature_importances_TabNet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21e7a143",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"XGBoost\")\n",
    "printResults(feature_importances_XGBoost)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d12930b",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"LightGBM\")\n",
    "printResults(feature_importances_lightGBM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6991e0f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"CatBoost\")\n",
    "printResults(feature_importances_catBoost)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ecfd2518",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Random Forest Classifier\")\n",
    "printResults(feature_importances_randforest)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}