您的位置:首页 > 其它

利用周末学R语言 (3) - 数据处理

2018-03-19 00:07 274 查看
本章不给出输出结果

数据处理-计算

# 求余
q1%%q2
# 整除
q1%/%q2
#数据计算
abs(-2)
# 根号
sqrt(2)
# 向上取整
ceiling(2.2)
# 向下取整
floor(2.2)
# 取整数部分
trunc(-2.22222)
# 四舍五入到指定位的小数
round(3.14,1)
# 四舍五入到指定的有效数字位数
signif(3.14,1)
# 三角函数
# cos,sin,tan(度数)
# 反三角函数
# acos,asin,atan
# 双曲函数
# cosh,sinh,tanh,acosh,asinh,atanh
# 对数
log(8,base=2)
# 自然对数
log(2.718282)
# 常用对数
log10(100)
# 指数
exp(1)
# 长度
length(c(1,1,1,1,1,1,1,1,1,1,1,1,1))
# 均值
mean(c(1,2,3,4,5))
# 截尾5%,丢弃缺失值
# mean(x,trim=0.05,na.rm=TRUE)
# 其他,不清楚可以使用help()
help(var)
x=c(1,1,1,1,1,2,2,5,6,7,7)
median(x)
sd(x)
var(x)
# 绝对中位差
mad(x)
quantile(x,c(.25,.5,.75))
range(x)
sum(x)
# 滞后差分
diff(x,lag=2)
min(x)
max(x)
y <- scale(x)
y
# 严格等于,浮点数慎用
3 == 3
# 不等于
3 != 3
# 或
3>2 | 4<= 5
# 与
2 ^ 3 == 8 & 2 ** 3 ==8


数据处理-日期

# 读取字符串日期为date
leadership$date <- as.Date(leadership$date,"%m/%d/%y")
# leadership
# 现在日期
today <- Sys.Date()
# 从日期中获取特定值
format(today, format="%B,%d,%A")
# 日期计算
today - as.Date("1989-06-04")
difftime(today, as.Date("1989-06-04"), units="weeks")
# 读取date为字符串
as.character(today)


数据处理-规整

# 创建leadership示例数据
manager <- c(1,2,3,4,5)
date <- c("10/24/08","10/28/08","10/1/08","10/12/08","5/1/09")
gender <- c("M","F","F","M","F")
age <- c(32,45,25,39,99)
q1 <- c(5,3,3,3,2)
q2 <- c(4,5,5,3,2)
q3 <- c(5,2,5,4,1)
q4 <- c(5,5,5,NA,2)
q5 <- c(5,5,2,NA,1)
leadership <- data.frame(manager,date,gender,age,
q1,q2,q3,q4,q5,stringsAsFactors=FALSE)
# 增加列
df <- data.frame(x1 = c(2, 2, 6, 4))
df1 <- transform(df,x2=c(7, 5, 10, 14))
# 将age替换为old,young和NA
leadership <- within(leadership,{
age[age==99] <- NA
age[age>40 & age<98] <- "old"
age[age<40] <- "young"
})
# leadership
# 将age改为oory(old or young)
names(leadership)[4] <- "oory"
# leadership
# NA是缺失值,NaN是无效值(比如被0除)
is.na(cbind(leadership$q4,leadership$q5))
# 删除有NA的行
leadership <- na.omit(leadership)
# leadership
# 字符串合并
paste(leadership$oory,leadership$gender,sep=" ")
# 类型判断
is.numeric(1.11)
# 类型转换
as.character(1.12)
# leadership$age <- NULL
# 排序
attach(leadership)
ndata <- leadership[order(gender,q1),]
detach()
# ndata
# 删除一列
# ndata <- subset(ndata, select = -age )
# ndata$age <- NULL
# 联结
# df <- merge(df1,df2,by="id")
# 筛选
ndata[which(ndata$gender == "F"),]
ndata <- subset(ndata,q1>=3,select=gender:q5)
# ndata
# 1到15间隔3的序列
seq(1,15,3)
# 抽样
mysample <- ndata[sample(1:nrow(ndata),3,replace = FALSE),]
# mysample
# 合并字符串并打印
cat("hellow","\b,","\bworld!")
# apply,应用到行/列
# apply(数据,维度,函数,函数的参数)
# 维度1  行
# 维度2  列


数据处理-整合与重构

# 创建cars数据框
id <- c(1,2,3,4,5,6,7,8,9,10)
brand <- c("volvo","volvo","volvo","bmw","bmw","mb","saab","saab","saab","gc")
price <- c(30,41,54,24,18,23,45,64,53,34)
sales <- c(1098,7768,8766,9898,87876,1987,2678,3345,6545,9475)
cars <- data.frame(id,brand,price,sales)
cars
# 转置
t(cars)
# 分组统计数据,按brand计算price和sales的mean
aggregate(cars[,c("price","sales")], by = list(meanprice=cars$brand),FUN=mean)
# 使用reshape进行重构
install.packages("reshape2")
library(reshape2)
# 原始数据
mydata <- read.table(header=TRUE, sep=" ", text="
id time X1 X2
1 1 5 6
1 2 3 5
2 1 6 1
2 2 2 4
")
# 短数据格式
mydata
# 长数据格式
mydata <- melt(mydata, id=(c("id","time")))
mydata
# 短数据格式
cast(mydata,id+time~variable)
cast(mydata,id~variable+time)


数据处理实战-简单版

options(digits=2)
Student <- c("John Davis", "Angela Williams", "Bullwinkle Moose",
"David Jones", "Janice Markhammer", "Cheryl Cushing",
"Reuven Ytzrhak", "Greg Knox", "Joel England",
"Mary Rayburn")
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)

roster <- data.frame(Student, Math, Science, English,
stringsAsFactors=FALSE)

z <- scale(roster[,2:4])

score <- apply(z,1,mean)

roster <- cbind(roster,score)

y <- quantile(score,c(.8,.6,.4,.2))

roster$grade[score >= y[1]] <- "A"
roster$grade[score < y[1] & score >= y[2]] <- "B"
roster$grade[score < y[2] & score >= y[3]] <- "C"
roster$grade[score < y[3] & score >= y[4]] <- "D"
roster$grade[score < y[4]] <- "F"

name <- strsplit((roster$Student), " ")

lastname <- sapply(name,"[",2)

firstname <- sapply(name,"[",1)

roster <- cbind(firstname,lastname,roster[,-1])

roster <- roster[order (lastname,firstname) ,]

roster


利用周末学R语言-系列地址

利用周末学R语言 (1) - RStudio,helloworld,数据类型与获取数据

利用周末学R语言 (2) - 画图

利用周末学R语言 (3) - 数据处理

利用周末学R语言 (4) - 控制流,函数,面向对象

统计建模等更多内容见 github.com/nightttt7/Rweekends

因jupyter notebook格式更适合展示,新内容将只放在github上,旧内容不定时更新
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: