seed = 0

[data]
normalization = 'quantile'
path = 'data/covtype'

[model]
activation = 'reglu'
attention_dropout = 0.03815883962184247
d_ffn_factor = 1.333333333333333
d_token = 424
ffn_dropout = 0.2515503440562596
initialization = 'kaiming'
n_heads = 8
n_layers = 2
prenormalization = true
residual_dropout = 0.0

[training]
batch_size = 1024
eval_batch_size = 8192
lr = 3.762989816330166e-05
# Effectively unbounded; training is expected to stop early once `patience`
# evaluations pass without improvement.
n_epochs = 1000000000
optimizer = 'adamw'
patience = 16
weight_decay = 0.0001239780004929955