{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "35e97960",
   "metadata": {},
   "source": [
    "# Preprocessing a small version of the HIGGS dataset\n",
    "\n",
    "This notebook loads the HIGGS CSV, assigns column names, casts the `target` label to integer, draws a fixed-seed random sample of `N_SAMPLES` rows, and stores that sample as a pickle at `OUTPUT_PATH`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b959547e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Note: not every import is used here\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.datasets import fetch_openml\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, log_loss, ConfusionMatrixDisplay, confusion_matrix\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "import scipy as sp\n",
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "import os\n",
    "import wget\n",
    "from pathlib import Path\n",
    "import shutil\n",
    "import gzip\n",
    "\n",
    "import re\n",
    "\n",
    "import random"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e1d7b02",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "Everything a reader might want to tune lives in the next cell: file paths, sample size, random seed, column names and display options."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0f19a3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# File locations (resolved relative to the kernel's working directory).\n",
    "INPUT_PATH = \"../HIGGS/HIGGS.csv\"\n",
    "OUTPUT_PATH = \"../HIGGS/processed-small.pkl\"\n",
    "\n",
    "# Number of rows to keep and the seed that makes the draw reproducible.\n",
    "N_SAMPLES = 500_000\n",
    "SEED = 1\n",
    "\n",
    "# Names assigned positionally to the columns of the header-less CSV.\n",
    "COLUMNS_LIST = [\n",
    "    \"target\",\n",
    "    \"lepton pT\", \"lepton eta\", \"lepton phi\",\n",
    "    \"missing energy magnitude\", \"missing energy phi\",\n",
    "    \"jet 1 pt\", \"jet 1 eta\", \"jet 1 phi\", \"jet 1 b-tag\",\n",
    "    \"jet 2 pt\", \"jet 2 eta\", \"jet 2 phi\", \"jet 2 b-tag\",\n",
    "    \"jet 3 pt\", \"jet 3 eta\", \"jet 3 phi\", \"jet 3 b-tag\",\n",
    "    \"jet 4 pt\", \"jet 4 eta\", \"jet 4 phi\", \"jet 4 b-tag\",\n",
    "    \"m_jj\", \"m_jjj\", \"m_lv\", \"m_jlv\", \"m_bb\", \"m_wbb\", \"m_wwbb\",\n",
    "]\n",
    "\n",
    "# Apply the default theme\n",
    "sns.set_theme(rc={\"patch.force_edgecolor\": False})\n",
    "\n",
    "# Show every column when a frame is displayed.\n",
    "pd.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a3b5c11",
   "metadata": {},
   "source": [
    "## Load and label the raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c493f24",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The CSV has no header row, so columns are named in the next cell.\n",
    "data = pd.read_csv(INPUT_PATH, header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d6b18be",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raises ValueError if the number of names does not match the number of columns.\n",
    "data.columns = COLUMNS_LIST"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40098294",
   "metadata": {},
   "outputs": [],
   "source": [
    "# astype(int) truncates toward zero, so verify the labels are exactly 0/1 first.\n",
    "# The check also passes on the already-cast column, keeping this cell re-runnable.\n",
    "assert data[\"target\"].isin([0, 1]).all(), \"unexpected values in 'target'; expected only 0 and 1\"\n",
    "data[\"target\"] = data[\"target\"].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "564bb11f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class balance of the full dataset.\n",
    "display(data[\"target\"].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47dd6e40",
   "metadata": {},
   "outputs": [],
   "source": [
    "display(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e27f6d48",
   "metadata": {},
   "source": [
    "## Draw a reproducible subsample\n",
    "\n",
    "Rows are sampled without replacement; the fixed seed makes the selection identical on every run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9a1e952",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_small = data.sample(n=N_SAMPLES, random_state=SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5e7a980",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class balance of the subsample, for comparison with the full dataset above.\n",
    "display(data_small[\"target\"].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f03bf77",
   "metadata": {},
   "outputs": [],
   "source": [
    "display(data_small)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "80659cf7",
   "metadata": {},
   "source": [
    "## Store the resulting dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc85154e",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_small.to_pickle(OUTPUT_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce29a453",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary (dtypes, non-null counts, memory usage) of the frame that was stored.\n",
    "data_small.info()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}