{ "cells": [ { "cell_type": "markdown", "id": "35e97960", "metadata": {}, "source": [ "# Preprocessing LOAN with only features visible to investors\n", "\n", "Code adapted from https://www.kaggle.com/code/pavlofesenko/minimizing-risks-for-loan-investments/notebook" ] }, { "cell_type": "code", "execution_count": null, "id": "b959547e", "metadata": {}, "outputs": [], "source": [ "# Note: not every import is used here\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.datasets import fetch_openml\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, log_loss, ConfusionMatrixDisplay, confusion_matrix\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", "import scipy as sp\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "# Apply the default theme\n", "sns.set_theme(rc={\"patch.force_edgecolor\": False})\n", "\n", "import os\n", "import wget\n", "from pathlib import Path\n", "import shutil\n", "import gzip\n", "\n", "import re\n", "\n", "pd.set_option('display.max_columns', None)\n", "\n", "import random\n" ] }, { "cell_type": "code", "execution_count": null, "id": "25aec06c", "metadata": {}, "outputs": [], "source": [ "# Config\n", "DROP_HIGH_DIM_CAT_COLS = False\n", "FILL_NAN_WITH_MEAN = False # if false, fill with 0\n", "OUTPUT_PATH = \"../LOAN/processed.pkl\"\n", "OUTPUT_PATH_BALANCED =\"../LOAN/processed_balanced.pkl\"\n", "\n", "\n", "target = [\"bad_investment\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "0c493f24", "metadata": {}, "outputs": [], "source": [ "data = pd.read_csv('../LOAN/accepted_2007_to_2018Q4.csv', low_memory=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "2d6b18be", "metadata": {}, "outputs": [], "source": [ "# Sheet 1 contains names and descriptions of features visible to investors\n", "feature_description = pd.read_excel('../LOAN/LCDataDictionary.xlsx', sheet_name=1)\n", "display(feature_description.head())" ] }, { "cell_type": "markdown", "id": "4a246f8c", "metadata": {}, "source": [ "### Fix differently formatted feature names between data and feature list" ] }, { "cell_type": "code", "execution_count": null, "id": "14147510", "metadata": {}, "outputs": [], "source": [ "feature_description_names = feature_description['BrowseNotesFile'].dropna().values\n", "feature_description_names = [re.sub('(? 30% missing values" ] }, { "cell_type": "code", "execution_count": null, "id": "9e4861a5", "metadata": {}, "outputs": [], "source": [ "missing_fractions = data.isnull().mean().sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "c8b99a23", "metadata": {}, "outputs": [], "source": [ "missing_fractions.head(10)" ] }, { "cell_type": "code", "execution_count": null, "id": "90ee206b", "metadata": {}, "outputs": [], "source": [ "drop_list = sorted(list(missing_fractions[missing_fractions > 0.3].index))\n", "print(len(drop_list), drop_list)" ] }, { "cell_type": "code", "execution_count": null, "id": "2a546cf7", "metadata": {}, "outputs": [], "source": [ "data.drop(labels=drop_list, axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "c509cb85", "metadata": {}, "outputs": [], "source": [ "data.shape" ] }, { "cell_type": "markdown", "id": "ef36c2bd", "metadata": {}, "source": [ "### Drop useless features" ] }, { "cell_type": "code", "execution_count": null, "id": "178638fb", "metadata": {}, "outputs": [], "source": [ "# drop ID\n", "data.drop('id', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "9e2bc56d", "metadata": {}, "outputs": [], "source": [ "# drop url\n", "data.drop('url', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "920938aa", "metadata": {}, "outputs": [], "source": [ "# drop title (the loan title provided by the borrower)\n", "data.drop('title', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "f92da83c", "metadata": {}, "outputs": [], "source": [ "# employment title has way to many unique values, so we drop it\n", "\n", "## Simple processing of user filled employment title column\n", "#data[\"emp_title\"] = data[\"emp_title\"].astype(str).str.lower()\n", "#data[\"emp_title\"] = data[\"emp_title\"].str.split('/').str[0]\n", "#data[\"emp_title\"] = data[\"emp_title\"].str.split(',').str[0]\n", "#data[\"emp_title\"] = data[\"emp_title\"].str.replace('.', '', regex=False)\n", "#data[\"emp_title\"] = data[\"emp_title\"].str.strip()\n", "\n", "data.drop('emp_title', axis=1, inplace=True)" ] }, { "cell_type": "markdown", "id": "00d7f5fa", "metadata": {}, "source": [ "### Process high dimensional categorical features" ] }, { "cell_type": "code", "execution_count": null, "id": "6b89e6b4", "metadata": {}, "outputs": [], "source": [ "for col in data.columns[data.dtypes == object]:\n", " print(col, data[col].nunique())" ] }, { "cell_type": "code", "execution_count": null, "id": "05346bf0", "metadata": {}, "outputs": [], "source": [ "# Convert date string column to year and month column\n", "data['earliest_cr_line_month'] = pd.to_datetime(data.earliest_cr_line, format='%b-%Y').dt.month\n", "data['earliest_cr_line_year'] = pd.to_datetime(data.earliest_cr_line, format='%b-%Y').dt.year\n", "data.drop('earliest_cr_line', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "c0f7d3cc", "metadata": {}, "outputs": [], "source": [ "if DROP_HIGH_DIM_CAT_COLS:\n", " data.drop([\"zip_code\"], axis=1, inplace=True)" ] }, { "cell_type": "markdown", "id": "bf61d6a6", "metadata": {}, "source": [ "### Process missing values" ] }, { "cell_type": "code", "execution_count": null, "id": "5113c7cb", "metadata": {}, "outputs": [], "source": [ "categorical_columns = []\n", "categorical_dims = {}\n", "for col in data.columns[data.dtypes == object]:\n", " if col != \"loan_status\":\n", " print(col, data[col].nunique())\n", " l_enc = LabelEncoder()\n", " data[col] = data[col].fillna(\"MISSING_VALUE\")\n", " data[col] = l_enc.fit_transform(data[col].values)\n", " categorical_columns.append(col)\n", " categorical_dims[col] = len(l_enc.classes_)" ] }, { "cell_type": "code", "execution_count": null, "id": "e68f4930", "metadata": {}, "outputs": [], "source": [ "for col in data.columns[data.dtypes == 'float64']:\n", " if col != \"loan_status\":\n", " if FILL_NAN_WITH_MEAN:\n", " data.fillna(data[col].mean(), inplace=True)\n", " else:\n", " data.fillna(0, inplace=True)\n", "\n", "unused_feat = []\n", "\n", "features = [ col for col in data.columns if col not in unused_feat+target+[\"loan_status\"] ]\n", "\n", "cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]\n", "\n", "cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]" ] }, { "cell_type": "code", "execution_count": null, "id": "d8f7e838", "metadata": {}, "outputs": [], "source": [ "print(features)" ] }, { "cell_type": "code", "execution_count": null, "id": "3bca8803", "metadata": {}, "outputs": [], "source": [ "print(cat_idxs)\n", "print(cat_dims)" ] }, { "cell_type": "code", "execution_count": null, "id": "77e8eb47", "metadata": { "scrolled": false }, "outputs": [], "source": [ "display(data.head(20))" ] }, { "cell_type": "markdown", "id": "39cbcec8", "metadata": {}, "source": [ "### Create target label" ] }, { "cell_type": "code", "execution_count": null, "id": "85033d68", "metadata": {}, "outputs": [], "source": [ "# Keep only fully paid or charged off\n", "# - Current and In Grace could still be bad, especially if loan is recent, so we drop it\n", "# - We consider Late and Default as bad, you want to get paid in time\n", "#loans = loans.loc[loans['loan_status'].isin(['Fully Paid', 'Charged Off'])]\n", "\n", "# Drop non-relevant rows\n", "data[\"loan_status\"] = data[\"loan_status\"].astype(str)\n", "data = data[~data[\"loan_status\"].str.contains(\"Current\")]\n", "data = data[~data[\"loan_status\"].str.contains(\"Does not meet the credit policy\")]\n", "data = data[~data[\"loan_status\"].str.contains(\"In Grace Period\")]\n", "data = data[~data[\"loan_status\"].str.contains(\"nan\")]" ] }, { "cell_type": "code", "execution_count": null, "id": "91e4a7ae", "metadata": {}, "outputs": [], "source": [ "display(data[\"loan_status\"].value_counts())" ] }, { "cell_type": "code", "execution_count": null, "id": "46d1396d", "metadata": {}, "outputs": [], "source": [ "# Create the final target column\n", "data[\"bad_investment\"] = 1 - data[\"loan_status\"].isin([\"Fully Paid\"]).astype('int')" ] }, { "cell_type": "code", "execution_count": null, "id": "bf74a68f", "metadata": {}, "outputs": [], "source": [ "display(data[\"bad_investment\"].value_counts())" ] }, { "cell_type": "code", "execution_count": null, "id": "7b544878", "metadata": {}, "outputs": [], "source": [ "# Drop original target label\n", "data.drop(\"loan_status\", axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "d03ede67", "metadata": {}, "outputs": [], "source": [ "display(data.head())" ] }, { "cell_type": "markdown", "id": "80659cf7", "metadata": {}, "source": [ "## Store resulting dataframe" ] }, { "cell_type": "code", "execution_count": null, "id": "bc85154e", "metadata": {}, "outputs": [], "source": [ "# Uncomment for the larger unbalanced version of the dataset\n", "#data.to_pickle(OUTPUT_PATH)" ] }, { "cell_type": "markdown", "id": "7cc580c4", "metadata": {}, "source": [ "### Store balanced version by undersampling" ] }, { "cell_type": "code", "execution_count": null, "id": "796fe13e", "metadata": {}, "outputs": [], "source": [ "data_minority = data[data[\"bad_investment\"]==1]\n", "data_majority = data[data[\"bad_investment\"]==0]\n", "data_majority = data_majority.sample(n=len(data_minority), random_state=37)\n", "data = pd.concat([data_minority,data_majority],axis=0)\n", "\n", "\n", "# Shuffle because undersampler orders on label\n", "data = data.sample(frac=1, random_state=37).reset_index(drop=True)\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "85dddffe", "metadata": {}, "outputs": [], "source": [ "data.to_pickle(OUTPUT_PATH_BALANCED)" ] }, { "cell_type": "code", "execution_count": null, "id": "5202e0d5", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 5 }