```toml
seed = 0

[data]
normalization = 'quantile'
path = 'data/covtype'

[model]
activation = 'reglu'
attention_dropout = 0.03815883962184247
d_ffn_factor = 1.333333333333333
d_token = 424
ffn_dropout = 0.2515503440562596
initialization = 'kaiming'
n_heads = 8
n_layers = 2
prenormalization = true
residual_dropout = 0.0

[training]
batch_size = 1024
eval_batch_size = 8192
lr = 3.762989816330166e-05
n_epochs = 1000000000
optimizer = 'adamw'
patience = 16
weight_decay = 0.0001239780004929955
```
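For reference, here is a minimal sketch of how such a config could be consumed, assuming Python 3.11+ (for the standard-library `tomllib`) and PyTorch; the filename `config.toml` and the placeholder module are assumptions, not part of the original config:

```python
# Sketch: load the hyperparameters above and wire the [training] section
# into an AdamW optimizer. The filename and the placeholder model are
# hypothetical; the real model would be built from cfg["model"]
# (d_token, n_heads, n_layers, ...).
import tomllib

import torch

with open("config.toml", "rb") as f:  # hypothetical path to the TOML above
    cfg = tomllib.load(f)

torch.manual_seed(cfg["seed"])

# Placeholder module standing in for the actual Transformer.
model = torch.nn.Linear(cfg["model"]["d_token"], cfg["model"]["d_token"])

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=cfg["training"]["lr"],
    weight_decay=cfg["training"]["weight_decay"],
)

# n_epochs is effectively infinite (1e9); training is presumably meant to
# stop via early stopping once validation stops improving for
# cfg["training"]["patience"] consecutive evaluations.
```

Note the values that are derived rather than literal: `d_ffn_factor = 1.333...` is presumably multiplied by `d_token` to obtain the FFN hidden size (424 * 4/3 ≈ 565), and the huge `n_epochs` combined with `patience = 16` indicates patience-based early stopping rather than a fixed epoch budget.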