Ranking Variable Importance and Selecting Features with the caret Package in R
2017-06-08 08:46
Overview
Once a supervised model has been trained, we can perturb its input values and measure how sensitive the model's output is to each change; this lets us assess how important the different features are to the model.

Procedure
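The perturbation idea behind this kind of importance measure can be illustrated in a few lines of base R (a toy sketch on invented data, not the caret implementation; `perm_importance` is a hypothetical helper):

```r
set.seed(42)
n = 200
x1 = rnorm(n)
x2 = rnorm(n)
y = 3 * x1 + 0.1 * x2 + rnorm(n, sd = 0.1)  # y depends strongly on x1, weakly on x2
fit = lm(y ~ x1 + x2)

# Shuffle one input column and measure how much the prediction error grows;
# a large increase means the model relied heavily on that feature
perm_importance = function(fit, data, y, col) {
  shuffled = data
  shuffled[[col]] = sample(shuffled[[col]])
  mean((y - predict(fit, shuffled))^2) - mean((y - predict(fit, data))^2)
}

d = data.frame(x1 = x1, x2 = x2)
perm_importance(fit, d, y, "x1")  # large: x1 is important
perm_importance(fit, d, y, "x2")  # close to zero: x2 barely matters
```

Note that caret's varImp, used below, relies on model-specific importance measures rather than this generic permutation scheme; the sketch only conveys the underlying intuition.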
Use caret to rank the variables by importance:

library(lattice)
library(ggplot2)
library(caret)
library(rpart)
library(C50)   # provides the churn dataset
data(churn)
str(churnTrain)
churnTrain = churnTrain[, !names(churnTrain) %in% c("state", "area_code", "account_length")]
# fix the random seed at 2 for reproducibility
set.seed(2)
# split churnTrain into two subsets, sampling with replacement with probabilities 0.7 and 0.3
ind = sample(2, nrow(churnTrain), replace = TRUE, prob = c(0.7, 0.3))
trainset = churnTrain[ind == 1, ]
testset = churnTrain[ind == 2, ]
control = trainControl(method = "repeatedcv", number = 10, repeats = 3)
model = train(churn ~ ., data = trainset, method = "rpart", preProcess = "scale", trControl = control)
importance = varImp(model, scale = FALSE)
importance

rpart variable importance

                              Overall
number_customer_service_calls 116.015
total_day_minutes             106.988
total_day_charge              100.648
international_planyes          86.789
voice_mail_planyes             25.974
total_eve_minutes              23.097
total_eve_charge               23.097
number_vmail_messages          19.885
total_intl_minutes              6.347
total_intl_calls                0.000
total_night_minutes             0.000
total_day_calls                 0.000
total_night_calls               0.000
total_night_charge              0.000
total_eve_calls                 0.000
total_intl_charge               0.000
Plot the variable importance:
plot(importance)
Classification packages such as rpart store variable importance inside the fitted model object, so we can also inspect importance directly from the model output.
model.rp = rpart(churn ~ ., data = trainset)
model.rp$variable.importance

            total_day_minutes              total_day_charge number_customer_service_calls            total_intl_minutes
                   111.645286                    110.881583                     58.486651                     48.283228
            total_intl_charge              total_eve_charge             total_eve_minutes            international_plan
                    47.698379                     47.166646                     47.166646                     42.194508
             total_intl_calls         number_vmail_messages               voice_mail_plan             total_night_calls
                    36.730344                     19.884863                     19.884863                      7.195828
              total_eve_calls            total_night_charge           total_night_minutes               total_day_calls
                     3.553423                      1.754547                      1.754547                      1.494986
Finding highly correlated features with the caret package
Rebuild the training set, dropping the class label and the factor features:
new_train = trainset[,!names(churnTrain) %in% c("churn","international_plan","voice_mail_plan")]
Compute the correlation between every pair of attributes:
cor_mat = cor(new_train)
cor_mat

                              number_vmail_messages total_day_minutes total_day_calls total_day_charge total_eve_minutes
number_vmail_messages                  1.000000e+00     -3.788346e-05    -0.015315725    -4.344686e-05       0.016058678
total_day_minutes                     -3.788346e-05      1.000000e+00     0.003940177     1.000000e+00       0.018136088
total_day_calls                       -1.531573e-02      3.940177e-03     1.000000000     3.942808e-03      -0.016774585
total_day_charge                      -4.344686e-05      1.000000e+00     0.003942808     1.000000e+00       0.018138428
total_eve_minutes                      1.605868e-02      1.813609e-02    -0.016774585     1.813843e-02       1.000000000
total_eve_calls                       -1.715851e-02      2.421777e-02     0.001938560     2.422109e-02      -0.024822635
total_eve_charge                       1.608561e-02      1.813039e-02    -0.016756722     1.813273e-02       0.999999775
total_night_minutes                    1.536272e-02      7.287277e-03     0.019582169     7.286773e-03      -0.009225611
total_night_calls                      7.575418e-03      2.334305e-02    -0.010994425     2.334289e-02      -0.001625934
total_night_charge                     1.534769e-02      7.246376e-03     0.019590677     7.245871e-03      -0.009243068
total_intl_minutes                     7.256768e-03     -1.623444e-02     0.019197349    -1.623804e-02      -0.012775313
total_intl_calls                       1.513658e-02      1.428235e-02     0.003062639     1.428083e-02      -0.001403721
total_intl_charge                      7.286473e-03     -1.620263e-02     0.019255703    -1.620623e-02      -0.012697993
number_customer_service_calls         -2.210761e-02     -7.981699e-03    -0.017568292    -7.982226e-03      -0.012419994
                              total_eve_calls total_eve_charge total_night_minutes total_night_calls total_night_charge
number_vmail_messages           -0.0171585053      0.016085609         0.015362721       0.007575418        0.015347687
total_day_minutes                0.0242177707      0.018130387         0.007287277       0.023343046        0.007246376
total_day_calls                  0.0019385605     -0.016756722         0.019582169      -0.010994425        0.019590677
total_day_charge                 0.0242210899      0.018132728         0.007286773       0.023342891        0.007245871
total_eve_minutes               -0.0248226349      0.999999775        -0.009225611      -0.001625934       -0.009243068
total_eve_calls                  1.0000000000     -0.024817028        -0.008842555       0.007155111       -0.008786733
total_eve_charge                -0.0248170279      1.000000000        -0.009223415      -0.001612116       -0.009240862
total_night_minutes             -0.0088425553     -0.009223415         1.000000000       0.026300284        0.999999233
total_night_calls                0.0071551108     -0.001612116         0.026300284       1.000000000        0.026261765
total_night_charge              -0.0087867331     -0.009240862         0.999999233       0.026261765        1.000000000
total_intl_minutes               0.0008668991     -0.012791651        -0.005866862       0.002762074       -0.005879993
total_intl_calls                 0.0079279293     -0.001397968        -0.012199350       0.015687833       -0.012174331
total_intl_charge                0.0008246073     -0.012714417        -0.005862733       0.002657183       -0.005875863
number_customer_service_calls    0.0063984603     -0.012415843        -0.001085209      -0.020231294       -0.001067571
                              total_intl_minutes total_intl_calls total_intl_charge number_customer_service_calls
number_vmail_messages               0.0072567683      0.015136582      0.0072864731                  -0.022107609
total_day_minutes                  -0.0162344421      0.014282352     -0.0162026313                  -0.007981699
total_day_calls                     0.0191973487      0.003062639      0.0192557029                  -0.017568292
total_day_charge                   -0.0162380367      0.014280828     -0.0162062318                  -0.007982226
total_eve_minutes                  -0.0127753135     -0.001403721     -0.0126979931                  -0.012419994
total_eve_calls                     0.0008668991      0.007927929      0.0008246073                   0.006398460
total_eve_charge                   -0.0127916505     -0.001397968     -0.0127144174                  -0.012415843
total_night_minutes                -0.0058668622     -0.012199350     -0.0058627326                  -0.001085209
total_night_calls                   0.0027620735      0.015687833      0.0026571833                  -0.020231294
total_night_charge                 -0.0058799926     -0.012174331     -0.0058758631                  -0.001067571
total_intl_minutes                  1.0000000000      0.044054461      0.9999928823                  -0.015930448
total_intl_calls                    0.0440544614      1.000000000      0.0441335428                  -0.018235001
total_intl_charge                   0.9999928823      0.044133543      1.0000000000                  -0.015940717
number_customer_service_calls      -0.0159304482     -0.018235001     -0.0159407173                   1.000000000
Call findCorrelation to find the attributes with correlation above 0.75:
highly_correlations = findCorrelation(cor_mat, cutoff = 0.75)
highly_correlations
[1] 11  4  5  8
Print the names of these highly correlated attributes:
names(new_train)[highly_correlations]
[1] "total_intl_minutes"  "total_day_charge"    "total_eve_minutes"   "total_night_minutes"
To obtain the pairwise correlations, we first drop the non-numeric attributes, compute the correlation matrix, and then apply a correlation cutoff of 0.75; the attributes flagged as highly correlated are total_intl_minutes, total_day_charge, total_eve_minutes, and total_night_minutes.
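The flagging step can be checked by hand on a toy matrix (a minimal sketch with made-up columns; the real findCorrelation additionally uses mean absolute correlations to decide which member of a correlated pair to drop):

```r
# Toy data: x and y are nearly identical, z is independent noise
set.seed(1)
x = rnorm(100)
toy = data.frame(x = x, y = x + rnorm(100, sd = 0.01), z = rnorm(100))

cm = abs(cor(toy))
diag(cm) = 0  # ignore the trivial self-correlations
# list the pairs whose absolute correlation exceeds the 0.75 cutoff
which(cm > 0.75, arr.ind = TRUE)  # only the (x, y) pair is flagged
```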
Selecting features with the caret package
Convert the international_plan feature of the training set trainset into the two indicator columns intl_yes and intl_no:
intl_plan = model.matrix(~ trainset$international_plan - 1, data = data.frame(trainset$international_plan))
colnames(intl_plan) = c("trainset$international_planno" = "intl_no",
                        "trainset$international_planyes" = "intl_yes")
Likewise, convert the voice_mail_plan feature of trainset into voice_yes and voice_no:
voice_plan = model.matrix(~ trainset$voice_mail_plan - 1, data = data.frame(trainset$voice_mail_plan))
colnames(voice_plan) = c("trainset$voice_mail_planno" = "voice_no",
                         "trainset$voice_mail_planyes" = "voice_yes")
Remove the original international_plan and voice_mail_plan attributes, then bind the intl_plan and voice_plan data frames onto the training set trainset:
trainset$international_plan = NULL
trainset$voice_mail_plan = NULL
trainset = cbind(intl_plan, voice_plan, trainset)
Apply the same processing to the test dataset:
intl_plan = model.matrix(~ testset$international_plan - 1, data = data.frame(testset$international_plan))
colnames(intl_plan) = c("testset$international_planno" = "intl_no",
                        "testset$international_planyes" = "intl_yes")
voice_plan = model.matrix(~ testset$voice_mail_plan - 1, data = data.frame(testset$voice_mail_plan))
colnames(voice_plan) = c("testset$voice_mail_planno" = "voice_no",
                         "testset$voice_mail_planyes" = "voice_yes")
testset$international_plan = NULL
testset$voice_mail_plan = NULL
testset = cbind(intl_plan, voice_plan, testset)
Create a feature-selection control object that uses linear discriminant analysis:
ldacontrol = rfeControl(functions = ldaFuncs,method = "cv")
Run backward feature selection on the training set trainset, trying subset sizes 1 through 18:
ldaprofile = rfe(trainset[, !names(trainset) %in% c("churn")], trainset[, c("churn")],
                 sizes = c(1:18), rfeControl = ldacontrol)
ldaprofile

Recursive feature selection

Outer resampling method: Cross-Validated (10 fold)

Resampling performance over subset size:

 Variables Accuracy  Kappa AccuracySD KappaSD Selected
         1   0.8523 0.0000   0.001675 0.00000
         2   0.8523 0.0000   0.001675 0.00000
         3   0.8436 0.1400   0.011711 0.09055
         4   0.8432 0.2076   0.010202 0.03927
         5   0.8471 0.2321   0.016556 0.05733
         6   0.8454 0.2308   0.015287 0.04411
         7   0.8462 0.2369   0.014101 0.04268
         8   0.8441 0.2220   0.016293 0.07222
         9   0.8458 0.2284   0.016027 0.06877
        10   0.8479 0.2377   0.017831 0.08265
        11   0.8492 0.2481   0.018360 0.08050
        12   0.8510 0.2542   0.016630 0.07754
        13   0.8514 0.2577   0.017362 0.07950
        14   0.8536 0.2695   0.016204 0.07610        *
        15   0.8523 0.2693   0.016640 0.06845
        16   0.8531 0.2713   0.016522 0.06908
        17   0.8514 0.2624   0.016150 0.07040
        18   0.8510 0.2612   0.015494 0.06877

The top 5 variables (out of 14):
   total_day_charge, total_day_minutes, intl_no, intl_yes, number_customer_service_calls
Plot the selection results:
plot(ldaprofile,type = c("o","g"))
(Figure: feature selection results)
Inspect the optimal variables:
ldaprofile$optVariables
 [1] "total_day_charge"              "total_day_minutes"             "intl_no"                       "intl_yes"
 [5] "number_customer_service_calls" "total_eve_minutes"             "total_eve_charge"              "voice_yes"
 [9] "total_intl_calls"              "voice_no"                      "number_vmail_messages"         "total_intl_charge"
[13] "total_intl_minutes"            "total_night_minutes"
Inspect the fitted model:
ldaprofile$fit
Call:
lda(x, y)

Prior probabilities of groups:
      yes        no
0.1477322 0.8522678

Group means:
    total_day_charge total_day_minutes   intl_no   intl_yes number_customer_service_calls total_eve_minutes total_eve_charge voice_yes
yes         35.00143          205.8877 0.7046784 0.29532164                      2.204678          213.7269         18.16702 0.1666667
no          29.62402          174.2555 0.9351242 0.06487582                      1.441460          199.6197         16.96789 0.2954891
    total_intl_calls  voice_no number_vmail_messages total_intl_charge total_intl_minutes total_night_minutes
yes         4.134503 0.8333333              5.099415          2.899386           10.73684            205.4640
no          4.514445 0.7045109              8.674607          2.741343           10.15119            201.4184

Coefficients of linear discriminants:
                                       LD1
total_day_charge               0.715025524
total_day_minutes             -0.130486469
intl_no                        1.129944662
intl_yes                      -1.129944662
number_customer_service_calls -0.421997335
total_eve_minutes              0.198406977
total_eve_charge              -2.390372792
voice_yes                      0.330463968
total_intl_calls               0.066240268
voice_no                      -0.330463968
number_vmail_messages         -0.003529233
total_intl_charge              2.315069869
total_intl_minutes            -0.693504606
total_night_minutes           -0.002127471
Finally, assess performance on the test set via resampling:
postResample(predict(ldaprofile, testset[, !names(testset) %in% c("churn")]), testset[, c("churn")])
 Accuracy     Kappa
0.8605108 0.2672027
We completed feature selection with the caret package. Because the dataset contains factor-coded attributes, we first called model.matrix to expand each factor into multiple binary attributes.
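The dummy coding that model.matrix performs can be seen on a small invented factor (a minimal sketch; the names plan, plan_no, and plan_yes are illustrative):

```r
# A toy two-level factor, analogous to international_plan
plan = factor(c("yes", "no", "no", "yes"))

# "- 1" removes the intercept, so every level gets its own 0/1 indicator column
dummies = model.matrix(~ plan - 1)
colnames(dummies) = c("plan_no", "plan_yes")
# rows with "yes" have plan_yes = 1; rows with "no" have plan_no = 1
```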
We set the resampling method in rfeControl to cross-validation ("cv") and supplied the helper functions ldaFuncs so that linear discriminant analysis is used for modelling; the recursive feature elimination function rfe then refits and re-evaluates the model on resampled data subsets for each candidate feature set and reports the selection results.
From the fitted profile we can plot the relationship between the number of variables and prediction accuracy; the plot shows that the best-performing subset contains 14 variables. Finally, on the test set we obtain a prediction accuracy of 0.86 and a Kappa of 0.27.