您的位置:首页 > 其它

sparkR处理Gb级数据集

2016-02-18 11:06 309 查看
spark集群搭建及介绍:敬请关注
数据集:http://pan.baidu.com/s/1sjYN7lF
总结:使用sparkR进行数据分析建模相比R大致有3-5倍的提升

查看原始数据集:通过iris数据集生成
[root@master data]# pwd
/data
[root@master data]# ls -lhsrt iris1g.txt

1.3G -rw-r--r-- 1 root root 1.3G Feb 16 14:16 iris1g.txt

登录sparkR:
sparkR --master yarn-client --num-executors 15

#1、加载数据:47671650条(千万级)数据,耗时1.60118 mins
> (time1 <-Sys.time())
[1] "2016-02-18 10:04:08 CST"
> data_iris <-read.table("/data/iris1g.txt", stringsAsFactors=T, sep=",",header=T, comment="", quote=NULL, encoding="UTF-8")

> Sys.time() -time1
Time difference of 1.60118 mins

#使用data.table中的fread读取数据:4000多万条数据,耗时1.910114 mins
library(data.table)
(time1 <-Sys.time())
data_iris <- fread("D:\\R大数据集/iris1g.txt",stringsAsFactors=T, sep=",",
header=T, encoding="UTF-8")
Sys.time() - time1

#2、数据预处理
> dim(data_iris)
[1] 47671650 5
str(data_iris)
> str(data_iris)
'data.frame': 47671650 obs. of 5 variables:
$ X.Sepal.Length.: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ X.Sepal.Width. : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ X.Petal.Length.: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ X.Petal.Width. : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ X.Species. : Factor w/ 3 levels "\"setosa\"","\"versicolor\"",..: 1 1 1 1 1 1 1 1 1 1 ...

> names(iris)
[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"

>names(data_iris)
[1] "X.Sepal.Length." "X.Sepal.Width." "X.Petal.Length." "X.Petal.Width."

[5] "X.Species."
>

>names(data_iris) <- names(iris)
>names(data_iris)
[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"

#3、创建训练集和测试集数据
library(caret)
#创建训练集和测试集数据:耗时6.402254 secs
> (time1 <-Sys.time())
[1] "2016-02-18 10:10:35 CST"
> ind <-base:::sample(3, nrow(data_iris), prob=c(0.3, 0.2, 0.5), replace=T)
> train <-data_iris[ind==1, ]
> test <-data_iris[ind==2, ]
> Sys.time() -time1
Time difference of 6.402254 secs

#使用createDataPartition导致内存溢出
#(time1 <-Sys.time())
#index <-createDataPartition(data$Species, nrow(data), p=0.7, list=F)
#Sys.time() - time1

#train <-data[index, ]
#test <-data[-index, ]

> dim(train)
[1] 14301827 5
> dim(test)
[1] 9533737 5

memory.size()
gc()

#4、建模
#1)随机森林
#library(randomForest)
#model <-randomForest(train$X.Species.~., data=train, ntree=50, nPerm=10, mtry=3,proximity=T, importance=T)
#随机森林建模导致内存溢出

#2)使用决策树建模:耗时1.891634 mins
library(party)
> (time1 <-Sys.time())
[1] "2016-02-18 10:12:08 CST"
> model <-ctree(Species~., data=train)
> Sys.time() -time1
Time difference of 1.891634 mins

>print(object.size(model), units="Mb")
6372.7 Mb
#str(model)
> summary(model)
Length Class Mode

1 BinaryTree S4

#5、预测
> (time1 <-Sys.time())
[1] "2016-02-18 10:14:49 CST"
> pred <-predict(model, test)
> Sys.time() -time1
Time difference of 36.58139 secs

#6、模型评估
table(pred,test$Species)
>mean(pred==test$Species)
[1] 1
>base:::table(pred, test$Species)

pred "setosa" "versicolor" "virginica"
"setosa" 3177256 0 0
"versicolor" 0 3178471 0
"virginica" 0 0 3178010
>library(gmodels)
>CrossTable(pred, test$Species)

Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|

Total Observations in Table: 9533737

| test$Species

pred | "setosa" | "versicolor" | "virginica" | Row Total |

-------------|--------------|--------------|--------------|--------------|
"setosa" | 3177256 | 0 | 0 | 3177256 |

| 4238091.601 | 1059271.517 | 1059117.882 | |

| 1.000 | 0.000 | 0.000 | 0.333 |

| 1.000 | 0.000 | 0.000 | |

| 0.333 | 0.000 | 0.000 | |

-------------|--------------|--------------|--------------|--------------|
"versicolor"| 0 | 3178471 | 0 | 3178471 |

| 1059271.517 | 4236471.588 | 1059522.895 | |

| 0.000 | 1.000 | 0.000 | 0.333 |

| 0.000 | 1.000 | 0.000 | |

| 0.000 | 0.333 | 0.000 | |

-------------|--------------|--------------|--------------|--------------|
"virginica" | 0 | 0 | 3178010 | 3178010 |

| 1059117.882 | 1059522.895 | 4237086.223 | |

| 0.000 | 0.000 | 1.000 | 0.333 |

| 0.000 | 0.000 | 1.000 | |

| 0.000 | 0.000 | 0.333 | |

-------------|--------------|--------------|--------------|--------------|
Column Total | 3177256 | 3178471 | 3178010 | 9533737 |

| 0.333 | 0.333 | 0.333 | |

-------------|--------------|--------------|--------------|--------------|

```
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: