Kaggle Titanic 入门实例：逻辑回归的使用 & 随机森林的使用
2016-01-11 11:58
393 查看
# coding: utf-8
"""Kaggle Titanic starter: logistic regression and random forest baselines.

Run as a script, this loads ``./csv/train.csv`` and ``./csv/test.csv``,
cleans both frames, prints the mean 3-fold cross-validation score of each
model, and writes the random-forest predictions to ``run-01.csv``.
"""
import numpy as np
import pandas as pd

# Integer codes substituted for the string labels of the categorical columns.
_SEX_CODES = {"male": 0, "female": 1}
_EMBARKED_CODES = {"S": 0, "C": 1, "Q": 2}


def _encode_labels(column, codes):
    """Return *column* with every label found in *codes* replaced by its code.

    Values that have no entry in *codes* are passed through unchanged, so
    encoding an already encoded column leaves it as it is.
    """
    return column.map(lambda value: codes.get(value, value))


def harmonize_data(titanic: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values and turn the string columns into integer codes.

    The frame is modified in place and also returned.

    * ``Age`` and ``Fare``: missing entries become the column median of the
      frame that was passed in.
    * ``Sex``: ``"male"`` -> 0, ``"female"`` -> 1.
    * ``Embarked``: missing entries are treated as ``"S"``; then
      ``"S"`` -> 0, ``"C"`` -> 1, ``"Q"`` -> 2.

    Args:
        titanic: Frame with ``Age``, ``Sex``, ``Embarked`` and ``Fare``
            columns.

    Returns:
        The same ``titanic`` object, after cleaning.
    """
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    titanic["Sex"] = _encode_labels(titanic["Sex"], _SEX_CODES)
    titanic["Embarked"] = _encode_labels(
        titanic["Embarked"].fillna("S"), _EMBARKED_CODES
    )
    titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
    return titanic


def create_submission(alg, train, test, predictors, filename):
    """Fit *alg* on the training frame and write its test predictions to CSV.

    Args:
        alg: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train: Training frame; must contain *predictors* and ``Survived``.
        test: Test frame; must contain *predictors* and ``PassengerId``.
        predictors: Names of the feature columns.
        filename: Path of the CSV file to write (no index column).
    """
    alg.fit(train[predictors], train["Survived"])
    predictions = alg.predict(test[predictors])
    submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
    submission.to_csv(filename, index=False)


def main():
    """Score both models with 3-fold cross-validation and write a submission."""
    # scikit-learn is imported here so that the data helpers above stay
    # usable in an environment where it is not installed.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        # Older scikit-learn releases only provide the function here.
        from sklearn.cross_validation import cross_val_score

    train = pd.read_csv("./csv/train.csv", dtype={"Age": np.float64})
    test = pd.read_csv("./csv/test.csv", dtype={"Age": np.float64})

    train_data = harmonize_data(train)
    test_data = harmonize_data(test)

    predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

    # cross_val_score reference:
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
    alg = LogisticRegression(random_state=1)
    scores = cross_val_score(
        alg,
        train_data[predictors],
        train_data["Survived"],
        cv=3
    )
    print(scores.mean())

    alg = RandomForestClassifier(
        random_state=1,
        n_estimators=150,
        min_samples_split=4,
        min_samples_leaf=2
    )
    scores = cross_val_score(
        alg,
        train_data[predictors],
        train_data["Survived"],
        cv=3
    )
    print(scores.mean())

    # The submission is produced with the random forest, the last model built.
    create_submission(alg, train_data, test_data, predictors, "run-01.csv")


if __name__ == "__main__":
    main()
相关文章推荐
- HTML表单
- cisco packet tracer 站点到站点 基于ASA5505 IPSEC *** 实验
- tns cluster 简介
- matlab和C/C++混合编程--Mex
- 数据库编程第3章
- JAVA知识问答
- Xamarin部署时遇到错误: Failure [INSTALL_FAILED_UPDATE_INCOMPATIBLE]
- jquery json 解析 互转
- 纯css满屏图像幻灯片制作
- Java中文件的压缩与解压
- 《从零开始自学iOS》_01
- Tengine的安装
- 数据库编程第2章
- JAVA中正则表达式总结
- 比较两个字符串大小
- Publishing to IIS 发布到IIS
- 删除Xcode自带的Storyboard和ViewController的过程
- 如何向PostgreSQL中添加bytea类型的大对象数据
- 专为Android加载图片——Fresco
- MyEclipse使用总结——MyEclipse2014安装SVN插件