您的位置：首页 > 其它

（R）机器学习--学习笔记--第四章节学习笔记

2015-04-28 11:24 337 查看

install.packages("tm")

library(tm)

install.packages("ggplot2")

library(ggplot2)

data.path<-"G:R/ML_for_Hackers-master/03-Classification/data/"

easyham.path<-paste(data.path,"easy_ham/",sep="")

//数据处理的方法

parse.email<-function(path){

full.msg<-msg.full(path)

date<-get.date(full.msg)

from<-get.from(full.msg)

subj<-get.subject(full.msg)

msg<-get.msg(full.msg)

return(c(date,from,subj,msg,path))

}

//第一个函数，用来获取所有的数据

msg.full<-function(path){

con<-file(path,open="rt",encoding="latin1")

msg<-readLines(con)

close(con)

return(msg)

}

//第二个函数，用来获取邮件发件人

//这里传入的是正则，需要详细了解

get.from<-function(msg.vec){

from<-msg.vec[grepl("From: ",msg.vec)]

if(length(strsplit(from,'[":<> ]'))>=1){

    from<-strsplit(from,'[":<> ]')[[1]]

    from<-from[which(from!=""&from!=" ")]

    return(from[grepl("@",from)][1])

}else{

    return("")

}

}

//第三个函数，获取邮件内容

get.msg<-function(msg.vec){

if(!(is.na(which(msg.vec=="")[1]+1))){

    msg<-msg.vec[seq(which(msg.vec=="")[1]+1,length(msg.vec),1)]

    return(paste(msg,collapse="\n"))

}else{

    return("")

}

}

//第四个函数，获取邮件主题

get.subject<-function(msg.vec){

subj<-msg.vec[grepl("Subject: ",msg.vec)]

if(length(subj)>0){

    return(strsplit(subj,"Subject: ")[[1]][2])

}else{

    return("")

}

}

//第五个函数，获取邮件的接收时间和日期

//用冒号，加号或者减号分割日期

//替换掉首尾的空格

get.date<-function(msg.vec){

date.grep<-grepl("^Date: ",msg.vec)

date.grepl<-which(date.grep==TRUE)

date<-msg.vec[date.grepl[1]]

date<-strsplit(date,"\\+|\\-|: ")[[1]][2]

date<-gsub("^\\s+|\\s+$","",date)

return(strtrim(date,25))

}

//数据最终整理

easyham.docs<-dir(easyham.path)

easyham.docs<-easyham.docs[which(easyham.docs!="cdms")]

easyham.parse<-lapply(easyham.docs,function(p) parse.email(paste(easyham.path,

                                                                 p,sep="")))

ehparse.matrix<-do.call(rbind,easyham.parse)

allparse.df<-data.frame(ehparse.matrix,stringAsFactors=FALSE)

names(allparse.df)<-c("Date","From.Email","Subject","Message","Path")

head(allparse.df)

//日期处理（可复用）

date.converter<-function(dates,pattern1,pattern2){

pattern1.convert<-strptime(dates,pattern1)

pattern2.convert<-strptime(dates,pattern2)

pattern1.convert[is.na(pattern1.convert)]<-

    pattern2.convert[is.na(pattern2.convert)]

return(pattern1.convert)

}

//这个地方很容易引起问题，导致时间转换后结果是NA，需要增加下面这个系统设置

Sys.setlocale("LC_TIME", "C");

pattern1<-"%a, %d %b %Y %H:%M:%S"

pattern2<-"%d %b %Y %H:%M:%S"

allparse.df$Date<-date.converter(allparse.df$Date,pattern1,pattern2)

//处理其他数据

allparse.df$Subject<-tolower(allparse.df$Subject)

allparse.df$From.Email<-tolower(allparse.df$From.Email)

priority.df<-allparse.df[with(allparse.df,order(Date)),]

priority.train<-priority.df[1:(round(nrow(priority.df)/2)),]

//设置权重策略
library(plyr)

//在这里遇到的主要问题是：

Error: 'names' attribute [11] must be the same length as the vector [1]

原因和解决方案如下面英文解说，主要是时间转换后就占用了9列，所以加起来是不一样的：

#The problem is with the Date field,

#which is at this point a list that contains 9 fields

#(sec, min, hour, mday, mon, year, wday, yday, isdst).

#To solve the problem I've simply converted the dates into character vectors,

#used ddply then converted the dates back to Date

tmp <- priority.train$Date

priority.train$Date <- as.character(priority.train$Date)

from.weight<-ddply(priority.train, .(From.Email), summarise, Freq=length(Subject))

priority.train$Date <- tmp

rm(tmp)

head(from.weight)

names(from.weight)
#此处增加绘制条状图的功能,详细方法

#绘制条状图，查看数据情况

from.weight <- from.weight[with(from.weight, order(Freq)), ]

from.ex <- subset(from.weight, Freq > 6)

from.scales <- ggplot(from.ex) +

geom_rect(aes(xmin = 1:nrow(from.ex) - 0.5,

                xmax = 1:nrow(from.ex) + 0.5,

                ymin = 0,

                ymax = Freq,

                fill = "lightgrey",

                color = "darkblue")) +

scale_x_continuous(breaks = 1:nrow(from.ex), labels = from.ex$From.Email) +

coord_flip() +

scale_fill_manual(values = c("lightgrey" = "lightgrey"), guide = "none") +

scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +

ylab("Number of Emails Received (truncated at 6)") +

xlab("Sender Address") +

theme_bw() +

theme(axis.text.y = element_text(size = 5, hjust = 1))

ggsave(plot = from.scales,

       filename = file.path("images", "0011_from_scales.pdf"),

       height = 4.8,

       width = 7)

//计算每一个线程的活跃度

get.threads<-function(threads.matrix,email.df){

threads<-unique(threads.matrix[,2])

thread.counts<-lapply(threads,function(t) thread.counts(t,email.df))

thread.matrix<-do.call(rbind,thread.counts)

return(cbind(threads,thread.matrix))

}

thread.counts<-function(thread,email.df){

thread.times<-email.df$Date[which(email.df$Subject==thread|

                                      email.df$Subject==paste("re:",thread))]

freq<-length(thread.times)

min.time<-min(thread.times)

max.time<-max(thread.times)

time.span<-as.numeric(difftime(max.time,min.time,units="secs"))

if(freq<2){

    return(c(NA,NA,NA))

}else{

    trans.weight<-freq/time.span

    log.trans.weight<-10+log(trans.weight,base=10)

    return(c(freq,time.span,log.trans.weight))

}

}

thread.weights<-get.threads(threads.matrix,priority.train)

thread.weights<-data.frame(thread.weights,StringAsFactors=FALSE)

names(thread.weights)<-c("Thread","Freq","Response","Weight")

thread.weights$Freq<-as.numeric(thread.weights$Freq)

thread.weights$Response<-as.numeric(thread.weights$Response)

#thread.weights$Weight<-as.numeric(thread.weights$Weight)

thread.weights<-subset(thread.weights,is.na(thread.weights$Freq)==FALSE)

head(thread.weights)

term.counts <- function(term.vec, control)

{

vec.corpus <- Corpus(VectorSource(term.vec))

vec.tdm <- TermDocumentMatrix(vec.corpus, control = control)

return(rowSums(as.matrix(vec.tdm)))

}

thread.terms <- term.counts(thread.weights$Thread,

                            control = list(stopwords = TRUE))

thread.terms <- names(thread.terms)

term.weights <- sapply(thread.terms,

                       function(t) mean(thread.weights$Weight[grepl(t, thread.weights$Thread, fixed = TRUE)]))

term.weights <- data.frame(list(Term = names(term.weights),

                                Weight = term.weights),

                           stringsAsFactors = FALSE,

                           row.names = 1:length(term.weights))

msg.terms <- term.counts(priority.train$Message,

                         control = list(stopwords = TRUE,

                                        removePunctuation = TRUE,

                                        removeNumbers = TRUE))

msg.weights <- data.frame(list(Term = names(msg.terms),

                               Weight = log(msg.terms, base = 10)),

                          stringsAsFactors = FALSE,

                          row.names = 1:length(msg.terms))

get.weights<-function(search.term,weight.df,term=TRUE){

if(length(search.term)>0){

    if(term){

      term.match<-match(names(search.term),weight.df$Term)

    }else{

      term.match<-match(search.term,weight.df$Term)

    }

    match.weights<-weight.df$Weight[which(!is.na(term.match))]

    if(length(match.weights)>1){

      return(1)

    }else{

      return(mean(match.weights))

    }

}else{

    return(1)

}

}

#排序的计算

rank.message <- function(path)

{

msg <- parse.email(path)

from <- ifelse(length(which(from.weight$From.Email == msg[2])) > 0,

                 from.weight$Weight[which(from.weight$From.Email == msg[2])],

                 1)

# Second is based on senders in threads, and threads themselves

thread.from <- ifelse(length(which(senders.df$From.Email == msg[2])) > 0,

                        senders.df$Weight[which(senders.df$From.Email == msg[2])],

                        1)

subj <- strsplit(tolower(msg[3]), "re: ")

is.thread <- ifelse(subj[[1]][1] == "", TRUE, FALSE)

if(is.thread)

{

    activity <- get.weights(subj[[1]][2], thread.weights, term = FALSE)

}

else

{

    activity <- 1

}

# Next, weight based on terms

# Weight based on terms in threads

thread.terms <- term.counts(msg[3], control = list(stopwords = TRUE))

thread.terms.weights <- get.weights(thread.terms, term.weights)

# Weight based terms in all messages

msg.terms <- term.counts(msg[4],

                           control = list(stopwords = TRUE,

                                          removePunctuation = TRUE,

                                          removeNumbers = TRUE))

msg.weights <- get.weights(msg.terms, msg.weights)

# Calculate rank by interacting all weights

rank <- prod(from,

               thread.from,

               activity,

               thread.terms.weights,

               msg.weights)

return(c(msg[1], msg[2], msg[3], rank))

}

# Find splits again

train.paths <- priority.df$Path[1:(round(nrow(priority.df) / 2))]

test.paths <- priority.df$Path[((round(nrow(priority.df) / 2)) + 1):nrow(priority.df)]

head(test.paths)

# Now, create a full-featured training set.

train.ranks <- suppressWarnings(lapply(train.paths, rank.message))

train.ranks.matrix <- do.call(rbind, train.ranks)

train.ranks.matrix <- cbind(train.paths, train.ranks.matrix, "TRAINING")

train.ranks.df <- data.frame(train.ranks.matrix, stringsAsFactors = FALSE)

names(train.ranks.df) <- c("Message", "Date", "From", "Subj", "Rank", "Type")

train.ranks.df$Rank <- as.numeric(train.ranks.df$Rank)

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： 机器学习数据

相关文章推荐

新的分享

章节导航