您的位置:首页 > 其它

机器学习实验(三):建立深度学习模型对kaggle保险索赔进行预测

2016-10-28 18:11 671 查看
原文作者: Danijel Kivaranovic
In [1]:
## import librariesimport numpy as npnp.random.seed(123)import pandas as pdimport subprocessfrom scipy.sparse import csr_matrix, hstackfrom sklearn.metrics import mean_absolute_errorfrom sklearn.preprocessing import StandardScalerfrom sklearn.cross_validation import KFoldfrom keras.models import Sequentialfrom keras.layers import Dense, Dropout, Activationfrom keras.layers.advanced_activations import PReLU## Batch generators #####################################################################################def batch_generator(X, y, batch_size, shuffle):#chenglong code for fiting from generator#(https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)number_of_batches = np.ceil(X.shape[0]/batch_size)counter = 0sample_index = np.arange(X.shape[0])if shuffle:np.random.shuffle(sample_index)while True:batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]X_batch = X[batch_index,:].toarray()y_batch = y[batch_index]counter += 1yield X_batch, y_batchif (counter == number_of_batches):if shuffle:np.random.shuffle(sample_index)counter = 0def batch_generatorp(X, batch_size, shuffle):number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)counter = 0sample_index = np.arange(X.shape[0])while True:batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]X_batch = X[batch_index, :].toarray()counter += 1yield X_batchif (counter == number_of_batches):counter = 0########################################################################################################### read datatrain = pd.read_csv('train.csv')test = pd.read_csv('test.csv')## set test loss to NaNtest['loss'] = np.nan## response and IDsy = train['loss'].valuesid_train = train['id'].valuesid_test = test['id'].values## stack train testntrain = train.shape[0]tr_te = pd.concat((train, test), axis = 0)## Preprocessing and transforming to sparse datasparse_data = []f_cat = [f for f in tr_te.columns if 'cat' in f]for f in f_cat:dummy = pd.get_dummies(tr_te[f].astype('category'))tmp = csr_matrix(dummy)sparse_data.append(tmp)f_num = [f for f in tr_te.columns if 'cont' in f]scaler = StandardScaler()tmp = csr_matrix(scaler.fit_transform(tr_te[f_num]))sparse_data.append(tmp)del(tr_te, train, test)## sparse train and test dataxtr_te = hstack(sparse_data, format = 'csr')xtrain = xtr_te[:ntrain, :]xtest = xtr_te[ntrain:, :]print('Dim train', xtrain.shape)print('Dim test', xtest.shape)del(xtr_te, sparse_data, tmp)## neural netdef nn_model():model = Sequential()model.add(Dense(400, input_dim = xtrain.shape[1], init = 'he_normal'))model.add(PReLU())model.add(Dropout(0.4))model.add(Dense(200, init = 'he_normal'))model.add(PReLU())model.add(Dropout(0.2))model.add(Dense(1, init = 'he_normal'))model.compile(loss = 'mae', optimizer = 'adadelta')return(model)## cv-foldsnfolds = 5folds = KFold(len(y), n_folds = nfolds, shuffle = True, random_state = 111)## train modelsi = 0nbags = 5nepochs = 55pred_oob = np.zeros(xtrain.shape[0])pred_test = np.zeros(xtest.shape[0])for (inTr, inTe) in folds:xtr = xtrain[inTr]ytr = y[inTr]xte = xtrain[inTe]yte = y[inTe]pred = np.zeros(xte.shape[0])for j in range(nbags):model = nn_model()fit = model.fit_generator(generator = batch_generator(xtr, ytr, 128, True),nb_epoch = nepochs,samples_per_epoch = xtr.shape[0],verbose = 0)pred += model.predict_generator(generator = batch_generatorp(xte, 800, False),val_samples = xte.shape[0])[:,0]pred_test += model.predict_generator(generator = batch_generatorp(xtest, 800, False),val_samples = xtest.shape[0])[:,0]pred /= nbagspred_oob[inTe] = predscore = mean_absolute_error(yte, pred)i += 1print('Fold ', i, '- MAE:', score)print('Total - MAE:', mean_absolute_error(y, pred_oob))## train predictionsdf = pd.DataFrame({'id': id_train, 'loss': pred_oob})df.to_csv('preds_oob.csv', index = False)## test predictionspred_test /= (nfolds*nbags)df = pd.DataFrame({'id': id_test, 'loss': pred_test})df.to_csv('submission_keras.csv', index = False)
Using Theano backend.
('Dim train', (188318, 1190))('Dim test', (125546, 1190))
WARNING (theano.gof.cmodule): ModuleCache.refresh() Found key without dll in cache, deleting it. /Users/youwei.tan/.theano/compiledir_Darwin-15.6.0-x86_64-i386-64bit-i386-2.7.12-64/tmpRXFbE3/key.pklWARNING (theano.gof.compilelock): Overriding existing lock by dead process '2865' (I am process '4459')/Applications/anaconda/lib/python2.7/site-packages/keras/engine/training.py:1460: UserWarning: Epoch comprised more than `samples_per_epoch` samples, which might affect learning results. Set `samples_per_epoch` correctly to avoid this warning.warnings.warn('Epoch comprised more than '
('Fold ', 1, '- MAE:', 1128.4291513927526)('Fold ', 2, '- MAE:', 1136.0284801627704)('Fold ', 3, '- MAE:', 1150.3288852079743)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
相关文章推荐