Kaggle ML for Titanic 技术代码Key points整理
2017-12-18 11:02
746 查看
# ---------------------------------------------------------------------------
# Titanic key points, part 1: missing-value handling, exploratory plots on
# `train_data`, and the first feature-encoding steps on `combined_train_test`.
#
# These are notebook-style snippets. They expect `pd`, `np`, `re`, `sns`,
# `plt` and the DataFrames `train_data`, `test_data`, `train_df_org`,
# `test_df_org`, `sibsp_df`, `fare_survived`, `fare_not_survived` to have
# been created earlier in the session.
# ---------------------------------------------------------------------------

sns.set_style('whitegrid')

# Overview of both data sets (dtypes and non-null counts).
train_data.info()
print("-" * 40)
test_data.info()

# Share of each Survived value.
train_data['Survived'].value_counts().plot.pie(autopct='%1.2f%%')

# --- Missing values ---------------------------------------------------------

# Fill missing Embarked with the most frequent value. A single `.loc`
# assignment is used instead of chained indexing so the write is guaranteed
# to land in `train_data` rather than in a temporary copy.
train_data.loc[train_data['Embarked'].isnull(), 'Embarked'] = (
    train_data['Embarked'].mode().iloc[0]
)

# Missing cabins get the placeholder 'U0'.
train_data['Cabin'] = train_data['Cabin'].fillna('U0')

from sklearn.ensemble import RandomForestRegressor

# Choose the training data used to predict Age; column 0 is the target and
# the remaining columns are the predictors.
age_df = train_data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
age_df_notnull = age_df.loc[train_data['Age'].notnull()]
age_df_isnull = age_df.loc[train_data['Age'].isnull()]
X = age_df_notnull.values[:, 1:]
Y = age_df_notnull.values[:, 0]

# Fit a random-forest regressor on the rows with a known Age and use it to
# fill in the rows where Age is missing.
RFR = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
RFR.fit(X, Y)
predictAges = RFR.predict(age_df_isnull.values[:, 1:])
train_data.loc[train_data['Age'].isnull(), 'Age'] = predictAges

# --- Sex / Pclass vs. Survived ----------------------------------------------

train_data.groupby(['Sex', 'Survived'])['Survived'].count()
train_data[['Sex', 'Survived']].groupby(['Sex']).mean().plot.bar()
train_data[['Sex', 'Pclass', 'Survived']].groupby(['Pclass', 'Sex']).mean().plot.bar()

# --- Age vs. Survived -------------------------------------------------------

fig, ax = plt.subplots(1, 2, figsize=(18, 8))
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and the keyword form works on older releases too.
sns.violinplot(x="Pclass", y="Age", hue="Survived", data=train_data,
               split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))

# Age distribution: histogram on the left, box plot on the right.
plt.figure(figsize=(12, 5))
plt.subplot(121)
train_data['Age'].hist(bins=70)
plt.xlabel('Age')
plt.ylabel('Num')
plt.subplot(122)
train_data.boxplot(column='Age', showfliers=False)

# Age density per Survived value.
facet = sns.FacetGrid(train_data, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, train_data['Age'].max()))
facet.add_legend()

# Mean survival for every integer age.
fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
train_data["Age_int"] = train_data["Age"].astype(int)
average_age = (
    train_data[["Age_int", "Survived"]]
    .groupby(['Age_int'], as_index=False)
    .mean()
)
sns.barplot(x='Age_int', y='Survived', data=average_age)

# Mean survival per age band: (0, 12], (12, 18], (18, 65], (65, 100].
bins = [0, 12, 18, 65, 100]
train_data['Age_group'] = pd.cut(train_data['Age'], bins)
by_age = train_data.groupby('Age_group')['Survived'].mean()
by_age.plot(kind='bar')

# --- Name -------------------------------------------------------------------

# The title is the run of letters that follows a space and precedes a dot.
# Raw string: `\.` is a regex escape, not a Python string escape.
train_data['Title'] = train_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
pd.crosstab(train_data['Title'], train_data['Sex'])

# Mean survival per name length.
fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
train_data['Name_length'] = train_data['Name'].apply(len)
name_length = (
    train_data[['Name_length', 'Survived']]
    .groupby(['Name_length'], as_index=False)
    .mean()
)
sns.barplot(x='Name_length', y='Survived', data=name_length)

# --- SibSp ------------------------------------------------------------------

# NOTE(review): `sibsp_df` is not built in this excerpt; presumably it is the
# subset of passengers with siblings/spouses aboard - confirm upstream.
plt.figure(figsize=(10, 5))
plt.subplot(121)
sibsp_df['Survived'].value_counts().plot.pie(
    labels=['No Survived', 'Survived'], autopct='%1.1f%%')
plt.xlabel('sibsp')

# --- Fare -------------------------------------------------------------------

plt.figure(figsize=(10, 5))
train_data['Fare'].hist(bins=70)
train_data.boxplot(column='Fare', by='Pclass', showfliers=False)
plt.show()

# Mean fare of the two groups with the standard deviation as error bars.
# NOTE(review): `fare_not_survived` / `fare_survived` come from outside this
# excerpt - presumably the Fare values split by Survived; confirm upstream.
average_fare = pd.DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare = pd.DataFrame([fare_not_survived.std(), fare_survived.std()])
average_fare.plot(yerr=std_fare, kind='bar', legend=False)

# --- Cabin ------------------------------------------------------------------

# Create a feature for the alphabetical part of the cabin number. The pattern
# is compiled once here instead of once per row inside the lambda.
cabin_letter_re = re.compile("([a-zA-Z]+)")
train_data['CabinLetter'] = train_data['Cabin'].map(
    lambda x: cabin_letter_re.search(x).group())
# Convert the distinct cabin letters to incremental integer values.
train_data['CabinLetter'] = pd.factorize(train_data['CabinLetter'])[0]
train_data[['CabinLetter', 'Survived']].groupby(['CabinLetter']).mean().plot.bar()

# --- Embarked ---------------------------------------------------------------

sns.countplot(x='Embarked', hue='Survived', data=train_data)
plt.title('Embarked and Survived')
# NOTE(review): seaborn 0.9 renamed `factorplot` to `catplot` and its `size`
# argument to `height`; switch to those names if the installed seaborn no
# longer provides the old ones.
sns.factorplot(x='Embarked', y='Survived', data=train_data, size=3, aspect=2)
plt.title('Embarked and Survived rate')
plt.show()

# One-hot encode Embarked and drop the original column.
embark_dummies = pd.get_dummies(train_data['Embarked'])
train_data = train_data.join(embark_dummies)
train_data.drop(['Embarked'], axis=1, inplace=True)

# Replace missing values with "U0" (same approach as the earlier Cabin fill,
# avoiding chained-indexing assignment).
train_data['Cabin'] = train_data['Cabin'].fillna('U0')
# Create a feature for the alphabetical part of the cabin number.
train_data['CabinLetter'] = train_data['Cabin'].map(
    lambda x: cabin_letter_re.search(x).group())
# Convert the distinct cabin letters to incremental integer values.
train_data['CabinLetter'] = pd.factorize(train_data['CabinLetter'])[0]

# --- Scaling and binning ----------------------------------------------------

from sklearn import preprocessing

assert np.size(train_data['Age']) == 891
# StandardScaler subtracts the mean from each value and then scales to unit
# variance; it expects a 2-D input, hence the reshape to a single column.
scaler = preprocessing.StandardScaler()
train_data['Age_scaled'] = scaler.fit_transform(train_data['Age'].values.reshape(-1, 1))

# Divide all fares into five equal-frequency bins (quintiles).
train_data['Fare_bin'] = pd.qcut(train_data['Fare'], 5)

# --- Combined train + test feature engineering ------------------------------

# Give the test set a placeholder Survived column so that both frames can be
# stacked. `pd.concat` is used because `DataFrame.append` no longer exists in
# pandas 2.x; like `append`, it keeps each frame's original index labels.
test_df_org['Survived'] = 0
combined_train_test = pd.concat([train_df_org, test_df_org])

# Fill missing Embarked with the most frequent value. Assigning the result
# back (rather than `inplace=True` on a column selection) guarantees the
# DataFrame itself is updated.
combined_train_test['Embarked'] = combined_train_test['Embarked'].fillna(
    combined_train_test['Embarked'].mode().iloc[0])
# For the later feature analysis, factorize Embarked into integer codes.
combined_train_test['Embarked'] = pd.factorize(combined_train_test['Embarked'])[0]
# One-hot encode the codes with pd.get_dummies.
emb_dummies_df = pd.get_dummies(combined_train_test['Embarked'], prefix='Embarked')
combined_train_test = pd.concat([combined_train_test, emb_dummies_df], axis=1)

# Collapse the raw titles into six groups.
# NOTE(review): `combined_train_test['Title']` is not extracted in this
# excerpt. Any title missing from this dict maps to NaN, which pd.factorize
# then encodes as -1 - verify that the upstream extraction yields exactly
# these keys (e.g. 'the Countess' rather than 'Countess').
title_groups = {
    'Officer': ['Capt', 'Col', 'Major', 'Dr', 'Rev'],
    'Royalty': ['Don', 'Sir', 'the Countess', 'Dona', 'Lady'],
    'Mrs': ['Mme', 'Ms', 'Mrs'],
    'Miss': ['Mlle', 'Miss'],
    'Mr': ['Mr'],
    'Master': ['Master', 'Jonkheer'],
}
title_Dict = {}
for group_name, raw_titles in title_groups.items():
    title_Dict.update(dict.fromkeys(raw_titles, group_name))
combined_train_test['Title'] = combined_train_test['Title'].map(title_Dict)
# For the later feature analysis, factorize Title as well.
combined_train_test['Title'] = pd.factorize(combined_train_test['Title'])[0]
title_dummies_df = pd.get_dummies(combined_train_test['Title'], prefix='Title')
combined_train_test = pd.concat([combined_train_test, title_dummies_df], axis=1)

combined_train_test['Name_length'] = combined_train_test['Name'].apply(len)
# ---------------------------------------------------------------------------
# Titanic key points, part 2: Fare / Pclass / family-size features on
# `combined_train_test`, a gradient-boosting regressor for the missing Age
# values, correlation plots, scaling, and random-forest feature ranking.
#
# These are notebook-style snippets. They expect `pd`, `np`, `sns`, `plt`,
# `preprocessing`, `model_selection`, `LabelEncoder`,
# `GradientBoostingRegressor`, `RandomForestClassifier`, the helpers
# `pclass_fare_category` / `family_size_category`, and the data objects
# `combined_train_test`, `missing_age_*`, `Correlation`,
# `titanic_train_data_X` / `_Y` and `top_n_features` to exist already.
# ---------------------------------------------------------------------------

# --- Fare -------------------------------------------------------------------

# Fill missing fares with the mean fare of the passenger's Pclass. Only the
# Fare column is averaged: taking the mean of every column would also hit the
# non-numeric ones.
combined_train_test['Fare'] = combined_train_test['Fare'].fillna(
    combined_train_test.groupby('Pclass')['Fare'].transform('mean'))

# Divide each fare by the number of rows sharing the same Ticket value, then
# drop the helper column.
combined_train_test['Group_Ticket'] = (
    combined_train_test['Fare']
    .groupby(by=combined_train_test['Ticket'])
    .transform('count')
)
combined_train_test['Fare'] = combined_train_test['Fare'] / combined_train_test['Group_Ticket']
combined_train_test.drop(['Group_Ticket'], axis=1, inplace=True)

# Five equal-frequency fare bins.
combined_train_test['Fare_bin'] = pd.qcut(combined_train_test['Fare'], 5)

# --- Pclass x Fare category -------------------------------------------------

# Mean fare per class, computed once and then looked up for each class.
pclass_mean_fare = combined_train_test['Fare'].groupby(by=combined_train_test['Pclass']).mean()
Pclass1_mean_fare = pclass_mean_fare.loc[1]
Pclass2_mean_fare = pclass_mean_fare.loc[2]
Pclass3_mean_fare = pclass_mean_fare.loc[3]

# Build the Pclass_Fare category.
# NOTE(review): `pclass_fare_category` is defined outside this excerpt; it is
# assumed to return one of the six labels fitted below - confirm.
combined_train_test['Pclass_Fare_Category'] = combined_train_test.apply(
    pclass_fare_category,
    args=(Pclass1_mean_fare, Pclass2_mean_fare, Pclass3_mean_fare),
    axis=1)
pclass_level = LabelEncoder()
# Register the label set.
pclass_level.fit(np.array(
    ['Pclass1_Low', 'Pclass1_High', 'Pclass2_Low', 'Pclass2_High',
     'Pclass3_Low', 'Pclass3_High']))
# Convert the labels to integer codes. This must run exactly once: a second
# transform would be fed the integer codes, which are not among the fitted
# string labels.
combined_train_test['Pclass_Fare_Category'] = pclass_level.transform(
    combined_train_test['Pclass_Fare_Category'])
# One-hot (dummy) encoding.
pclass_dummies_df = pd.get_dummies(
    combined_train_test['Pclass_Fare_Category']
).rename(columns=lambda x: 'Pclass_' + str(x))
combined_train_test = pd.concat([combined_train_test, pclass_dummies_df], axis=1)

# --- Family size ------------------------------------------------------------

# Family size = parents/children + siblings/spouses + the passenger.
combined_train_test['Family_Size'] = combined_train_test['Parch'] + combined_train_test['SibSp'] + 1
# NOTE(review): `family_size_category` is defined outside this excerpt; it is
# assumed to return 'Single', 'Small_Family' or 'Large_Family' - confirm.
combined_train_test['Family_Size_Category'] = combined_train_test['Family_Size'].map(family_size_category)
le_family = LabelEncoder()
le_family.fit(np.array(['Single', 'Small_Family', 'Large_Family']))
combined_train_test['Family_Size_Category'] = le_family.transform(
    combined_train_test['Family_Size_Category'])
family_size_dummies_df = pd.get_dummies(
    combined_train_test['Family_Size_Category'], prefix='Family_Size_Category')
combined_train_test = pd.concat([combined_train_test, family_size_dummies_df], axis=1)

# --- Missing Age: model 1, gradient boosting --------------------------------

gbm_reg = GradientBoostingRegressor(random_state=42)
gbm_reg_param_grid = {
    'n_estimators': [2000],
    'max_depth': [4],
    'learning_rate': [0.01],
    'max_features': [3],
}
gbm_reg_grid = model_selection.GridSearchCV(
    gbm_reg, gbm_reg_param_grid, cv=10, n_jobs=25, verbose=1,
    scoring='neg_mean_squared_error')
gbm_reg_grid.fit(missing_age_X_train, missing_age_Y_train)
print('Age feature Best GB Params:' + str(gbm_reg_grid.best_params_))
print('Age feature Best GB Score:' + str(gbm_reg_grid.best_score_))
# With scoring='neg_mean_squared_error' the value printed below is the
# negated mean squared error on the training rows.
print('GB Train Error for "Age" Feature Regressor:' + str(gbm_reg_grid.score(missing_age_X_train, missing_age_Y_train)))
missing_age_test.loc[:, 'Age_GB'] = gbm_reg_grid.predict(missing_age_X_test)
print(missing_age_test['Age_GB'][:4])

# --- Correlation plots ------------------------------------------------------

# NOTE(review): `Correlation` is built outside this excerpt; presumably a
# DataFrame holding the feature columns to correlate - confirm.
colormap = plt.cm.viridis
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(Correlation.astype(float).corr(), linewidths=0.1, vmax=1.0,
            square=True, cmap=colormap, linecolor='white', annot=True)

# Pairwise scatter plots coloured by Survived, with KDE plots on the diagonal.
g = sns.pairplot(
    combined_train_test[['Survived', 'Pclass', 'Sex', 'Age', 'Fare',
                         'Embarked', 'Family_Size', 'Title', 'Ticket_Letter']],
    hue='Survived', palette='seismic', size=1.2, diag_kind='kde',
    diag_kws=dict(shade=True), plot_kws=dict(s=10))
g.set(xticklabels=[])

# --- Scaling ----------------------------------------------------------------

# Standardise the three continuous features in place.
scale_age_fare = preprocessing.StandardScaler().fit(
    combined_train_test[['Age', 'Fare', 'Name_length']])
combined_train_test[['Age', 'Fare', 'Name_length']] = scale_age_fare.transform(
    combined_train_test[['Age', 'Fare', 'Name_length']])

# --- Feature ranking: random forest -----------------------------------------

rf_est = RandomForestClassifier(random_state=0)
rf_param_grid = {'n_estimators': [500], 'min_samples_split': [2, 3], 'max_depth': [20]}
rf_grid = model_selection.GridSearchCV(rf_est, rf_param_grid, n_jobs=25, cv=10, verbose=1)
rf_grid.fit(titanic_train_data_X, titanic_train_data_Y)
print('Top N Features Best RF Params:' + str(rf_grid.best_params_))
print('Top N Features Best RF Score:' + str(rf_grid.best_score_))
print('Top N Features RF Train Score:' + str(rf_grid.score(titanic_train_data_X, titanic_train_data_Y)))

# Rank the features by the best estimator's importances and keep the top N.
# `list(titanic_train_data_X)` yields the column labels when the training
# matrix is a DataFrame.
feature_imp_sorted_rf = pd.DataFrame({
    'feature': list(titanic_train_data_X),
    'importance': rf_grid.best_estimator_.feature_importances_,
}).sort_values('importance', ascending=False)
features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
print('Sample 10 Features from RF Classifier')
print(str(features_top_n_rf[:10]))
相关文章推荐
- 一个技术小白整理出来的for循环使用几种方法~~
- 基础代码汇总整理 for NOIP 2009 修订版(下)
- Java基础:技术人常见十大代码错误整理
- 【JQuery】优化页面性能的代码技术整理(提高网页响应速度必看)
- tensorflow官方教程 - MNIST for ML Beginers - 代码及注释
- 自己的技术经验和心得,及常用的资料和代码如何整理、分类和保存才好呢?
- 试试MNIST For ML Beginners代码
- tensorflow官方教程 - MNIST for ML Beginers - 代码及注释
- python机器学习实战之 Decision Tree For Titanic in Kaggle
- kaggle数据挖掘竞赛初步--Titanic<数据变换> 完整代码: https://github.com/cindycindyhi/kaggle-Titanic 特征工程系列: Titanic
- oxford-cs-ml-2015/practical6 代码解读(LSTMs for language modelling)
- 优化页面性能的代码技术整理(提高网页响应速度必看)
- Air for ios横版格斗过关项目技术整理
- 【Android技术整理】XML生成与解析伪代码
- 基础代码汇总整理 for NOIP 2009 修订版(上)
- 关于WORD里怎样编辑代码好看(技术帖整理)
- How to write a good design document for peer engineers (如何写一份给工程师看的技术文档)
- js返回上一页并刷新代码整理 转
- Android官方ApiDemo中animation部分代码要点整理
- IOS----NSdateFormatter 相关整理