learnR_basicdataprocessing_5
2017-03-13 20:12
183 查看
learnR_basicdataprocessing_5
# learnR_basicdataprocessing_5 -- basic data processing in R
#
# Tutorial script meant to be run from top to bottom. Sections that need
# third-party packages (car, doBy, reshape, sqldf) are skipped when the
# package is not installed, so the base-R parts always run.

# Example data frame reused by the sorting / subsetting sections ----------
gender <- c("F", "F", "M", NA, "F", "M")
age <- c(12, 56, 34, 67, NA, 23)
student <- c(NA, "Tom", "Toby", "Jreey", "Curry", "Allen")
score <- c(99, NA, 60, 75, 65, 85)
leader <- data.frame(gender, age, student, score, stringsAsFactors = FALSE)

# 1. Working with variables ------------------------------------------------

# 1.1 Creating new variables ----
# Pattern: variable <- expression

# Example 1: explicit `$` references.
mydata <- data.frame(x1 = c(1, 2, 3, 4), x2 = c(5, 6, 7, 8))
mydata$sumx <- mydata$x1 + mydata$x2
mydata$meanx <- (mydata$x1 + mydata$x2) / 2

# Example 2: evaluate inside the data frame. with() replaces the older
# attach(mydata) / detach(mydata) idiom, which leaks column names onto the
# search path and can silently pick up stale copies.
mydata$sumx <- with(mydata, x1 + x2)
mydata$meanx <- with(mydata, (x1 + x2) / 2)

# Example 3: transform()
mydata <- transform(mydata, sumx = x1 + x2, meanx = (x1 + x2) / 2)

# 1.2 Recoding variables ----
# Pattern: variable[condition] <- expression

# 1.2.1 Example 1: recode a numeric age into a category.
df <- data.frame(age = c(20, 45, 80))
df <- within(df, {
  agecat <- NA
  agecat[age > 0 & age < 30] <- "Young"
  agecat[age >= 30 & age <= 50] <- "Middle Aged"
  agecat[age > 50] <- "Elder"
})

# Recoding helper functions

# 1.2.2 car::recode() -- recode a variable
if (requireNamespace("car", quietly = TRUE)) {
  library(car)
  x <- rep(1:3, 3)
  print(recode(x, "c(1,2) = 'A'; else = 'B'"))
  print(recode(x, "1:2 = 'C'; 3 = 'D'"))
}

# 1.2.3 doBy::recodeVar() -- recode the values of a vector
if (requireNamespace("doBy", quietly = TRUE)) {
  library(doBy)
  x <- c(NA, rep(1:3, 3), NA)
  print(recodeVar(x, src = list(c(1, 2)), tgt = list("A"), default = "dft"))
  print(recodeVar(x, src = list(c(1, 2)), tgt = list("B"), default = "dft",
                  keep.na = FALSE))
}

# 1.2.4 cut() -- convert numeric to factor
aaa <- c(1:5, 2:7)
cut(aaa, 3)
# Spell out `ordered_result`; `ordered = T` only worked through partial
# argument matching and the reassignable `T` shortcut.
cut(aaa, 3, dig.lab = 4, ordered_result = TRUE)

# 1.3 Renaming variables ----
df <- data.frame(name = c("aa", "bb", "cc"), age = c(10, NA, 110))

# 1.3.1 fix(df) -- interactive editor, so only offered in interactive use.
if (interactive()) {
  fix(df)
}

# 1.3.2 reshape::rename()
if (requireNamespace("reshape", quietly = TRUE)) {
  library(reshape)
  df <- rename(df, c(name = "new_name", age = "new_age"))
}

# 1.3.3 names()
names(df) <- c("n_name", "n_age")

# 1.4 Missing values ----
# NA   : placeholder for a missing value
# NaN  : "not a number", placeholder for an undefined numeric result
# NULL : the empty object
# Inf  : placeholder for infinity

# Test for missing values.
is.na(df)

# Turn a noise value into NA. The column is called `n_age` after the
# rename above; %in% never returns NA, so existing NAs are left alone.
df$n_age[df$n_age %in% 110] <- NA

# Excluding NA:
# eg1: NAs are dropped before summing (for sum() that equals adding 0).
y <- sum(c(1, 2, NA, 3), na.rm = TRUE)
# eg2: delete every row that contains an NA.
new_df <- na.omit(df)

# 1.5 Dates ----
Sys.Date() # year-month-day, as a Date object
date()     # weekday, date and clock time, as a plain character string

# The default date format is yyyy-mm-dd (yyyy/mm/dd is also accepted).
my_dates <- as.Date(c("2017/03/22", "2017/03/25"))
her_dates <- as.Date("03/22/2017", format = "%m/%d/%Y")

# 1.5.1 format() -- extract a specific part of a date
# eg1: date() returns a plain string, which is awkward to take apart, so
# start from Sys.Date().
today <- Sys.Date()
today_week <- format(today, format = "%A")
# eg2: weekday of a given date
birthday <- as.Date("1999-11-25")
one_week <- format(birthday, format = "%A")

# 1.5.2 difftime() -- difference between dates
# eg1:
difftime(today, birthday, units = "weeks")
# eg2:
days <- today - birthday

# 1.5.3 Other date functions: combining a date with a clock time
# strptime() parses a string   -> class c("POSIXlt", "POSIXt")
# strftime() formats date-time -> class "character"
p_time <- strptime("25/3/17 20:01:30.007", "%d/%m/%y %H:%M:%OS")
# Format the parsed object. Passing the raw string to strftime() would
# first auto-parse it as year 25, month 3, day 17.
f_time <- strftime(p_time, format = "%d/%m/%y %H:%M:%OS")
is_s <- "25/3/17 20:01:30.007"
# FALSE: the output zero-pads the month and drops fractional seconds.
f_time == is_s

# Convert a date to a character string.
str_date <- as.character(today)

# Further reading: help(ISOdatetime); packages lubridate and timeDate
# (the successor of fCalendar).

# 1.6 Type conversion ----
# as.numeric()    (test with is.numeric())
# as.character()
# as.vector()
# as.matrix()
# as.data.frame()
# as.factor()
# as.logical()
# These become powerful when combined with control flow (if-then).

# 1.7 Sorting ----
# order() on the `leader` data frame defined at the top: gender ascending,
# age descending within gender, NAs last. Referencing leader$... keeps the
# sort tied to the data frame instead of same-named global vectors.
new_data1 <- leader[order(leader$gender, -leader$age, na.last = TRUE), ]

# 1.8 Merging data sets ----
df_A <- data.frame(ID = c(1, 2, 3), Postcode = c("A1", "B2", "C3"),
                   income = c(10, 20, 30), stringsAsFactors = FALSE)
df_B <- data.frame(ID = c(1, 2, 4), Postcode = c("A1", "B2", "D4"),
                   spend = c(5, 6, 7), stringsAsFactors = FALSE)

# 1.8.1 Adding columns
# eg1: join on shared key columns
total_1 <- merge(df_A, df_B, by = c("ID", "Postcode"))
# eg2: no shared key needed, but A and B must have the same number of
# rows, in the same order.
A <- data.frame(id = 1:3)
B <- data.frame(value = c("a", "b", "c"), stringsAsFactors = FALSE)
total_2 <- cbind(A, B)

# 1.8.2 Adding rows
# eg3: both data frames must have the same variables; their column order
# does not matter.
df_C <- data.frame(income = 40, Postcode = "D4", ID = 4,
                   stringsAsFactors = FALSE)
total_3 <- rbind(df_A, df_C)

# 1.9 Subsetting data sets ----

# 1.9.1 Keeping variables (columns)
# Pattern: new_df <- df[, column_index]
# eg1: keep some columns
df1 <- leader[, 2:4]
# eg2: select by name
my_df <- data.frame(matrix(1:12, 3, 4))
vars1 <- paste0("X", 1:4)
new_df <- my_df[vars1]

# 1.9.2 Dropping variables (columns)
# Way 1, pattern: new_df <- df[!vars]
vars2 <- names(my_df) %in% c("X3", "X4")
new_df <- my_df[!vars2]
# Way 2, pattern: new_df <- df[c(-var3, -var4)]
new_df <- my_df[c(-3, -4)]
# Finally, the same columns can also be deleted by assigning NULL:
my_df$X3 <- my_df$X4 <- NULL

# 1.9.3 Keeping observations (rows)
# Pattern: new_df <- df[row_index, ]
# eg1:
df1 <- leader[2:4, ]
# eg2: which() drops the NA comparisons; qualifying columns with leader$
# is less error-prone than relying on same-named global vectors.
new_data2 <- leader[which(leader$gender == "M" | leader$age <= 34), ]

# 1.9.4 subset()
# `select` picks the columns to keep.
new_data3 <- subset(leader, gender == "M" & age <= 34, select = age:score)

# 1.9.5 Random sampling (without replacement)
my_sample <- leader[sample(seq_len(nrow(leader)), 3, replace = FALSE), ]
# See also the packages sampling and survey.

# 1.10 Using SQL statements ----
if (requireNamespace("sqldf", quietly = TRUE)) {
  library(sqldf)
  newdf <- sqldf("select * from mtcars where carb=1 order by mpg",
                 row.names = TRUE)
  print(sqldf(paste(
    "select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear",
    "from mtcars where cyl in (4,6) group by gear"
  )))
}
相关文章推荐
- learnR_advanceddataprocessing_6
- [导入]Daily Report (Learn some basic knowlage about American football.)
- 《Data-intensive Text Processing with MapReduce》读书笔记第2章:MapReduce基础(1)
- Data-Intensive Text Processing with MapReduce第三章(2)-MapReduce算法设计-3.1局部聚集
- MapReduce: Simplified Data Processing on Large Clusters(转并改)
- WebSphere DataStage BASIC 语言开发实践
- Data-Intensive Text Processing with MapReduce 第三章(1)——local aggregation
- Data-Intensive Text Processing with MapReduce第三章(4)-SECONDARY SORTING
- 上传文件报错:Processing of multipart/form-data request failed. Stream ended unexpectedly
- Basic Data Structures and Algorithms in the Linux Kernel
- Tez: Accelerating processing of data stored in HDFS
- R dataframe basic operations
- Scilkit-Learn:Working With Text Data(文本分类)
- 基于Problem Solving with Algorithms and Data Structures using Python的学习记录(3)——Basic Data Structures
- JavaScript.The.Definitive.Guide—Core Javascript-basic Data Types
- MapReduce:Simplified Data Processing on Large Clusters(中文翻译3)
- data Streaming Processing system
- basic_string::c_str() 与 basic_string::data() 区别
- 解决 Processing of multipart/form-data request failed. /upload/A.tmp (No such file or directory) 问题
- SAP DEMO-Processing Data