Python数据分析与机器学习-用户流失预警
2018-01-31 21:16
507 查看
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Print complete frames instead of truncated ones.
# NOTE: the legacy 'display.height' option is intentionally not set: it no
# longer exists in current pandas (setting it raises OptionError) and
# 'display.max_rows' below already controls how many rows are shown.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

# Load the raw customer records; the CSV is expected next to the script.
churn_df = pd.read_csv('churn.csv')
# Sample of churn_df (first five rows):
#   State Account Length Area Code Phone Int'l Plan VMail Plan VMail Message Day Mins Day Calls Day Charge Eve Mins Eve Calls Eve Charge Night Mins Night Calls Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls Churn?
# 0 KS 128 415 382-4657 no yes 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1 False.
# 1 OH 107 415 371-7191 no yes 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1 False.
# 2 NJ 137 415 358-1921 no no 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0 False.
# 3 OH 84 408 375-9999 yes no 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2 False.
# 4 OK 75 415 330-6626 yes no 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3 False.

# Drop the columns that are not used as features, plus the label column.
churn_feat_space = churn_df.drop(['State', 'Area Code', 'Phone', 'Churn?'], axis=1)

# Encode the 'yes'/'no' plan columns as booleans so they can be cast to float.
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
# Sample of churn_feat_space (first five rows):
#   Account Length Int'l Plan VMail Plan VMail Message Day Mins Day Calls Day Charge Eve Mins Eve Calls Eve Charge Night Mins Night Calls Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls
# 0 128 False True 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1
# 1 107 False True 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1
# 2 137 False False 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0
# 3 84 True False 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2
# 4 75 True False 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3

# Feature matrix as a float ndarray. DataFrame.as_matrix() and the np.float
# alias have both been removed from pandas / NumPy, so use to_numpy() with
# the builtin float (float64) instead.
X = churn_feat_space.to_numpy(dtype=float)

# Binary label vector: 1 where the 'Churn?' column reads 'True.', else 0.
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.', 1, 0)
from sklearn.ensemble import RandomForestClassifier as RF
# sklearn.cross_validation was removed in scikit-learn 0.20; KFold now lives
# in sklearn.model_selection and takes n_splits instead of (n, n_folds).
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC

# Standardize the feature matrix column by column.
scaler = StandardScaler()
X = scaler.fit_transform(X)
# print(X[0])
# [ 0.67648946 -0.32758048 1.6170861 1.23488274 1.56676695 0.47664315
#   1.56703625 -0.07060962 -0.05594035 -0.07042665 0.86674322 -0.46549436
#   0.86602851 -0.08500823 -0.60119509 -0.0856905 -0.42793202]


def _fitted_folds(X, y, clf_class, clf_kwargs, n_splits=5):
    """Yield ``(fitted_classifier, test_index)`` for each shuffled K-fold split.

    A fresh ``clf_class(**clf_kwargs)`` is built and fitted on the training
    part of every fold; the caller decides what to predict on the test part.
    """
    kf = KFold(n_splits=n_splits, shuffle=True)
    for train_index, test_index in kf.split(X):
        clf = clf_class(**clf_kwargs)
        clf.fit(X[train_index], y[train_index])
        yield clf, test_index


def run_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class predictions from 5-fold cross-validation.

    Args:
        X: feature matrix, indexable by an array of row indices.
        y: label array with one entry per row of ``X``.
        clf_class: classifier class exposing ``fit`` and ``predict``.
        **kwargs: keyword arguments forwarded to ``clf_class``.

    Returns:
        A copy of ``y`` in which every entry has been replaced by the
        prediction made while that row was in the held-out fold.
    """
    y_pred = y.copy()
    for clf, test_index in _fitted_folds(X, y, clf_class, kwargs):
        y_pred[test_index] = clf.predict(X[test_index])
    return y_pred


def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)


print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X, y, SVC)))
print("Random forest:")
print("%.3f" % accuracy(y, run_cv(X, y, RF)))
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X, y, KNN)))


def run_prob_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class probabilities from 5-fold cross-validation.

    Args:
        X: feature matrix, indexable by an array of row indices.
        y: label array with one entry per row of ``X``.
        clf_class: classifier class exposing ``fit`` and ``predict_proba``.
        **kwargs: keyword arguments forwarded to ``clf_class``.

    Returns:
        Array of shape ``(len(y), 2)``; each row is filled with the
        ``predict_proba`` output obtained while that row was held out.
    """
    y_prob = np.zeros((len(y), 2))
    for clf, test_index in _fitted_folds(X, y, clf_class, kwargs):
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X[test_index])
    return y_prob


# Churn probability per customer.
# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# print(pred_prob[0])
pred_churn = pred_prob[:, 1]
is_churn = y == 1

# Number of times a predicted probability is assigned to an observation.
# (The top-level pd.value_counts is deprecated; use the Series method.)
counts = pd.Series(pred_churn).value_counts()
# print(counts)

# Observed churn rate among the customers that share each predicted value.
# Built in counts.index order so the concat below aligns row for row.
true_prob = pd.Series(
    {prob: np.mean(is_churn[pred_churn == prob]) for prob in counts.index}
)

# Side-by-side table: predicted probability, how often it occurs, true rate.
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
print(counts)
相关文章推荐
- python数据分析与机器学习-用户流失预警
- Python数据分析——基于CART决策树的用户识别
- Python数据分析与机器学习-使用Kmeans进行图像压缩
- 关于 Python 数据抓取 & 分析 & 机器学习 & 挖掘 & 神经网络 内容的分享。
- Python数据分析与机器学习-scikit-learn模型建立与评估
- 机器学习项目实战之用户流失预警
- 关于 Python 数据抓取、分析、挖掘、机器学习和Python 分布式计算内容分享
- Python数据分析与机器学习-梯度下降策略
- python实现人人网用户数据爬取及简单分析
- Python数据分析与机器学习-足球赛事数据集
- Python vs R : 在机器学习和数据分析领域中的对比
- Python数据分析与机器学习-线性回归算法原理推导
- Python数据分析与机器学习-PCA主成分分析
- Python数据分析与机器学习-新闻分类任务
- 500G python web、爬虫、数据分析、机器学习、大数据、前端实战项目视频代码免费分享
- python数据分析(预测性分析与机器学习)
- 2018python数据分析与机器学习实战(视频+源码+课件)
- 分析以数据挖掘技术预测用户流失情况的方法
- Python数据分析与机器学习-交易数据异常检测
- 关于用户流失,数据分析可以挽回一线生机