data_openml.py

import openml
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from torch.utils.data import Dataset


def simple_lapsed_time(text, lapsed):
    # Pretty-print an elapsed time given in seconds as HH:MM:SS.ss.
    hours, rem = divmod(lapsed, 3600)
    minutes, seconds = divmod(rem, 60)
    print(text + ": {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))


def task_dset_ids(task):
    # OpenML dataset ids used for each task type.
    dataset_ids = {
        'binary': [1487, 44, 1590, 42178, 1111, 31, 42733, 1494, 1017, 4134],
        'multiclass': [188, 1596, 4541, 40664, 40685, 40687, 40975, 41166, 41169, 42734],
        'regression': [541, 42726, 42727, 422, 42571, 42705, 42728, 42563, 42724, 42729]
    }
    return dataset_ids[task]


def concat_data(X, y):
    # Concatenate features and target into a single dataframe.
    return pd.concat([pd.DataFrame(X['data']),
                      pd.DataFrame(y['data'][:, 0].tolist(), columns=['target'])], axis=1)


def data_split(X, y, nan_mask, indices):
    # Slice the feature matrix and its nan mask by the given row indices.
    x_d = {
        'data': X.values[indices],
        'mask': nan_mask.values[indices]
    }
    if x_d['data'].shape != x_d['mask'].shape:
        raise ValueError('Shape of data not same as that of nan mask!')
    y_d = {
        'data': y[indices].reshape(-1, 1)
    }
    return x_d, y_d


def data_prep_openml(ds_id, seed, task, datasplit=[.65, .15, .2]):
    # Download an OpenML dataset, split it into train/valid/test, label-encode
    # categorical columns and compute training statistics for continuous ones.
    np.random.seed(seed)
    dataset = openml.datasets.get_dataset(ds_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)
    categorical_columns = X.columns[list(np.where(np.array(categorical_indicator) == True)[0])].tolist()
    cont_columns = list(set(X.columns.tolist()) - set(categorical_columns))
    cat_idxs = list(np.where(np.array(categorical_indicator) == True)[0])
    con_idxs = list(set(range(len(X.columns))) - set(cat_idxs))
    for col in categorical_columns:
        X[col] = X[col].astype("object")

    # Random train/valid/test assignment according to the requested proportions.
    X["Set"] = np.random.choice(["train", "valid", "test"], p=datasplit, size=(X.shape[0],))
    train_indices = X[X.Set == "train"].index
    valid_indices = X[X.Set == "valid"].index
    test_indices = X[X.Set == "test"].index
    X = X.drop(columns=['Set'])

    # Binary mask: 1 where a value is present, 0 where it is missing.
    temp = X.fillna("MissingValue")
    nan_mask = temp.ne("MissingValue").astype(int)

    # Label-encode categorical columns; missing values become their own category.
    cat_dims = []
    for col in categorical_columns:
        X[col] = X[col].fillna("MissingValue")
        l_enc = LabelEncoder()
        X[col] = l_enc.fit_transform(X[col].values)
        cat_dims.append(len(l_enc.classes_))

    # Impute missing continuous values with the training-set mean of each column.
    for col in cont_columns:
        X[col] = X[col].fillna(X.loc[train_indices, col].mean())

    y = y.values
    if task != 'regression':
        l_enc = LabelEncoder()
        y = l_enc.fit_transform(y)

    X_train, y_train = data_split(X, y, nan_mask, train_indices)
    X_valid, y_valid = data_split(X, y, nan_mask, valid_indices)
    X_test, y_test = data_split(X, y, nan_mask, test_indices)

    # Mean/std of continuous features on the training split, used for normalisation.
    train_mean = np.array(X_train['data'][:, con_idxs], dtype=np.float32).mean(0)
    train_std = np.array(X_train['data'][:, con_idxs], dtype=np.float32).std(0)
    train_std = np.where(train_std < 1e-6, 1e-6, train_std)
    return cat_dims, cat_idxs, con_idxs, X_train, y_train, X_valid, y_valid, X_test, y_test, train_mean, train_std


class DataSetCatCon(Dataset):
    # Torch dataset that returns categorical and continuous features (plus
    # their missingness masks) separately, with a [CLS] token prepended to
    # the categorical part of every row.
    def __init__(self, X, Y, cat_cols, task='clf', continuous_mean_std=None):
        cat_cols = list(cat_cols)
        X_mask = X['mask'].copy()
        X = X['data'].copy()
        con_cols = list(set(np.arange(X.shape[1])) - set(cat_cols))
        self.X1 = X[:, cat_cols].copy().astype(np.int64)             # categorical columns
        self.X2 = X[:, con_cols].copy().astype(np.float32)           # numerical columns
        self.X1_mask = X_mask[:, cat_cols].copy().astype(np.int64)   # categorical mask
        self.X2_mask = X_mask[:, con_cols].copy().astype(np.int64)   # numerical mask
        if task == 'clf':
            self.y = Y['data']
        else:
            self.y = Y['data'].astype(np.float32)
        # [CLS] token (index 0) for every row, always treated as observed.
        self.cls = np.zeros_like(self.y, dtype=int)
        self.cls_mask = np.ones_like(self.y, dtype=int)
        if continuous_mean_std is not None:
            mean, std = continuous_mean_std
            self.X2 = (self.X2 - mean) / std

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # X1 has categorical data, X2 has continuous data.
        return np.concatenate((self.cls[idx], self.X1[idx])), self.X2[idx], self.y[idx], \
            np.concatenate((self.cls_mask[idx], self.X1_mask[idx])), self.X2_mask[idx]
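

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal end-to-end example
# of how the helpers above can be combined with a PyTorch DataLoader. The
# dataset id 31 ('credit-g', taken from the 'binary' list above), the seed and
# the batch size are illustrative assumptions, not values prescribed by this
# file.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    cat_dims, cat_idxs, con_idxs, X_train, y_train, X_valid, y_valid, \
        X_test, y_test, train_mean, train_std = data_prep_openml(
            ds_id=31, seed=0, task='binary')

    # Continuous features are standardised with training-set statistics.
    train_ds = DataSetCatCon(X_train, y_train, cat_idxs, task='clf',
                             continuous_mean_std=(train_mean, train_std))
    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

    # Each batch: categorical data (with [CLS]), continuous data, targets,
    # and the two corresponding missingness masks.
    x_categ, x_cont, y, cat_mask, con_mask = next(iter(train_loader))
    print(x_categ.shape, x_cont.shape, y.shape)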