(R)机器学习--学习笔记--第四章节学习笔记
2015-04-28 11:24
337 查看
install.packages("tm")
library(tm)
install.packages("ggplot2")
library(ggplot2)
data.path<-"G:R/ML_for_Hackers-master/03-Classification/data/"
easyham.path<-paste(data.path,"easy_ham/",sep="")
//数据处理的方法
parse.email<-function(path){
full.msg<-msg.full(path)
date<-get.date(full.msg)
from<-get.from(full.msg)
subj<-get.subject(full.msg)
msg<-get.msg(full.msg)
return(c(date,from,subj,msg,path))
}
//第一个函数,用来获取所有的数据
msg.full<-function(path){
con<-file(path,open="rt",encoding="latin1")
msg<-readLines(con)
close(con)
return(msg)
}
//第二个函数,用来获取邮件发件人
//这里传入的是正则,需要详细了解
get.from<-function(msg.vec){
from<-msg.vec[grepl("From: ",msg.vec)]
if(length(strsplit(from,'[":<> ]'))>=1){
from<-strsplit(from,'[":<> ]')[[1]]
from<-from[which(from!=""&from!=" ")]
return(from[grepl("@",from)][1])
}else{
return("")
}
}
//第三个函数,获取邮件内容
get.msg<-function(msg.vec){
if(!(is.na(which(msg.vec=="")[1]+1))){
msg<-msg.vec[seq(which(msg.vec=="")[1]+1,length(msg.vec),1)]
return(paste(msg,collapse="\n"))
}else{
return("")
}
}
//第四个函数,获取邮件主题
get.subject<-function(msg.vec){
subj<-msg.vec[grepl("Subject: ",msg.vec)]
if(length(subj)>0){
return(strsplit(subj,"Subject: ")[[1]][2])
}else{
return("")
}
}
//第五个函数,获取邮件的接收时间和日期
//用冒号,加号或者减号分割日期
//替换掉首尾的空格
get.date<-function(msg.vec){
date.grep<-grepl("^Date: ",msg.vec)
date.grepl<-which(date.grep==TRUE)
date<-msg.vec[date.grepl[1]]
date<-strsplit(date,"\\+|\\-|: ")[[1]][2]
date<-gsub("^\\s+|\\s+$","",date)
return(strtrim(date,25))
}
//数据最终整理
easyham.docs<-dir(easyham.path)
easyham.docs<-easyham.docs[which(easyham.docs!="cdms")]
easyham.parse<-lapply(easyham.docs,function(p) parse.email(paste(easyham.path,
p,sep="")))
ehparse.matrix<-do.call(rbind,easyham.parse)
allparse.df<-data.frame(ehparse.matrix,stringAsFactors=FALSE)
names(allparse.df)<-c("Date","From.Email","Subject","Message","Path")
head(allparse.df)
//日期处理(可复用)
date.converter<-function(dates,pattern1,pattern2){
pattern1.convert<-strptime(dates,pattern1)
pattern2.convert<-strptime(dates,pattern2)
pattern1.convert[is.na(pattern1.convert)]<-
pattern2.convert[is.na(pattern2.convert)]
return(pattern1.convert)
}
//这个地方很容易引起问题,导致时间转换后结果是NA,需要增加下面这个系统设置
Sys.setlocale("LC_TIME", "C");
pattern1<-"%a, %d %b %Y %H:%M:%S"
pattern2<-"%d %b %Y %H:%M:%S"
allparse.df$Date<-date.converter(allparse.df$Date,pattern1,pattern2)
//处理其他数据
allparse.df$Subject<-tolower(allparse.df$Subject)
allparse.df$From.Email<-tolower(allparse.df$From.Email)
priority.df<-allparse.df[with(allparse.df,order(Date)),]
priority.train<-priority.df[1:(round(nrow(priority.df)/2)),]
//设置权重策略
library(plyr)
//在这里遇到的主要问题是:
原因和解决方案如下面英文解说,主要是时间转换后就占用了9列,所以加起来是不一样的:
#The problem is with the Date field,
#which is at this point a list that contains 9 fields
#(sec, min, hour, mday, mon, year, wday, yday, isdst).
#To solve the problem I've simply converted the dates into character vectors,
#used ddply then converted the dates back to Date
tmp <- priority.train$Date
priority.train$Date <- as.character(priority.train$Date)
from.weight<-ddply(priority.train, .(From.Email), summarise, Freq=length(Subject))
priority.train$Date <- tmp
rm(tmp)
head(from.weight)
names(from.weight)
#此处增加绘制条状图的功能,详细方法
#绘制条状图,查看数据情况
from.weight <- from.weight[with(from.weight, order(Freq)), ]
from.ex <- subset(from.weight, Freq > 6)
from.scales <- ggplot(from.ex) +
geom_rect(aes(xmin = 1:nrow(from.ex) - 0.5,
xmax = 1:nrow(from.ex) + 0.5,
ymin = 0,
ymax = Freq,
fill = "lightgrey",
color = "darkblue")) +
scale_x_continuous(breaks = 1:nrow(from.ex), labels = from.ex$From.Email) +
coord_flip() +
scale_fill_manual(values = c("lightgrey" = "lightgrey"), guide = "none") +
scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +
ylab("Number of Emails Received (truncated at 6)") +
xlab("Sender Address") +
theme_bw() +
theme(axis.text.y = element_text(size = 5, hjust = 1))
ggsave(plot = from.scales,
filename = file.path("images", "0011_from_scales.pdf"),
height = 4.8,
width = 7)
//计算每一个线程的活跃度
get.threads<-function(threads.matrix,email.df){
threads<-unique(threads.matrix[,2])
thread.counts<-lapply(threads,function(t) thread.counts(t,email.df))
thread.matrix<-do.call(rbind,thread.counts)
return(cbind(threads,thread.matrix))
}
thread.counts<-function(thread,email.df){
thread.times<-email.df$Date[which(email.df$Subject==thread|
email.df$Subject==paste("re:",thread))]
freq<-length(thread.times)
min.time<-min(thread.times)
max.time<-max(thread.times)
time.span<-as.numeric(difftime(max.time,min.time,units="secs"))
if(freq<2){
return(c(NA,NA,NA))
}else{
trans.weight<-freq/time.span
log.trans.weight<-10+log(trans.weight,base=10)
return(c(freq,time.span,log.trans.weight))
}
}
thread.weights<-get.threads(threads.matrix,priority.train)
thread.weights<-data.frame(thread.weights,StringAsFactors=FALSE)
names(thread.weights)<-c("Thread","Freq","Response","Weight")
thread.weights$Freq<-as.numeric(thread.weights$Freq)
thread.weights$Response<-as.numeric(thread.weights$Response)
#thread.weights$Weight<-as.numeric(thread.weights$Weight)
thread.weights<-subset(thread.weights,is.na(thread.weights$Freq)==FALSE)
head(thread.weights)
term.counts <- function(term.vec, control)
{
vec.corpus <- Corpus(VectorSource(term.vec))
vec.tdm <- TermDocumentMatrix(vec.corpus, control = control)
return(rowSums(as.matrix(vec.tdm)))
}
thread.terms <- term.counts(thread.weights$Thread,
control = list(stopwords = TRUE))
thread.terms <- names(thread.terms)
term.weights <- sapply(thread.terms,
function(t) mean(thread.weights$Weight[grepl(t, thread.weights$Thread, fixed = TRUE)]))
term.weights <- data.frame(list(Term = names(term.weights),
Weight = term.weights),
stringsAsFactors = FALSE,
row.names = 1:length(term.weights))
msg.terms <- term.counts(priority.train$Message,
control = list(stopwords = TRUE,
removePunctuation = TRUE,
removeNumbers = TRUE))
msg.weights <- data.frame(list(Term = names(msg.terms),
Weight = log(msg.terms, base = 10)),
stringsAsFactors = FALSE,
row.names = 1:length(msg.terms))
get.weights<-function(search.term,weight.df,term=TRUE){
if(length(search.term)>0){
if(term){
term.match<-match(names(search.term),weight.df$Term)
}else{
term.match<-match(search.term,weight.df$Term)
}
match.weights<-weight.df$Weight[which(!is.na(term.match))]
if(length(match.weights)>1){
return(1)
}else{
return(mean(match.weights))
}
}else{
return(1)
}
}
#排序的计算
rank.message <- function(path)
{
msg <- parse.email(path)
from <- ifelse(length(which(from.weight$From.Email == msg[2])) > 0,
from.weight$Weight[which(from.weight$From.Email == msg[2])],
1)
# Second is based on senders in threads, and threads themselves
thread.from <- ifelse(length(which(senders.df$From.Email == msg[2])) > 0,
senders.df$Weight[which(senders.df$From.Email == msg[2])],
1)
subj <- strsplit(tolower(msg[3]), "re: ")
is.thread <- ifelse(subj[[1]][1] == "", TRUE, FALSE)
if(is.thread)
{
activity <- get.weights(subj[[1]][2], thread.weights, term = FALSE)
}
else
{
activity <- 1
}
# Next, weight based on terms
# Weight based on terms in threads
thread.terms <- term.counts(msg[3], control = list(stopwords = TRUE))
thread.terms.weights <- get.weights(thread.terms, term.weights)
# Weight based terms in all messages
msg.terms <- term.counts(msg[4],
control = list(stopwords = TRUE,
removePunctuation = TRUE,
removeNumbers = TRUE))
msg.weights <- get.weights(msg.terms, msg.weights)
# Calculate rank by interacting all weights
rank <- prod(from,
thread.from,
activity,
thread.terms.weights,
msg.weights)
return(c(msg[1], msg[2], msg[3], rank))
}
# Find splits again
train.paths <- priority.df$Path[1:(round(nrow(priority.df) / 2))]
test.paths <- priority.df$Path[((round(nrow(priority.df) / 2)) + 1):nrow(priority.df)]
head(test.paths)
# Now, create a full-featured training set.
train.ranks <- suppressWarnings(lapply(train.paths, rank.message))
train.ranks.matrix <- do.call(rbind, train.ranks)
train.ranks.matrix <- cbind(train.paths, train.ranks.matrix, "TRAINING")
train.ranks.df <- data.frame(train.ranks.matrix, stringsAsFactors = FALSE)
names(train.ranks.df) <- c("Message", "Date", "From", "Subj", "Rank", "Type")
train.ranks.df$Rank <- as.numeric(train.ranks.df$Rank)
library(tm)
install.packages("ggplot2")
library(ggplot2)
data.path<-"G:R/ML_for_Hackers-master/03-Classification/data/"
easyham.path<-paste(data.path,"easy_ham/",sep="")
//数据处理的方法
parse.email<-function(path){
full.msg<-msg.full(path)
date<-get.date(full.msg)
from<-get.from(full.msg)
subj<-get.subject(full.msg)
msg<-get.msg(full.msg)
return(c(date,from,subj,msg,path))
}
//第一个函数,用来获取所有的数据
msg.full<-function(path){
con<-file(path,open="rt",encoding="latin1")
msg<-readLines(con)
close(con)
return(msg)
}
//第二个函数,用来获取邮件发件人
//这里传入的是正则,需要详细了解
get.from<-function(msg.vec){
from<-msg.vec[grepl("From: ",msg.vec)]
if(length(strsplit(from,'[":<> ]'))>=1){
from<-strsplit(from,'[":<> ]')[[1]]
from<-from[which(from!=""&from!=" ")]
return(from[grepl("@",from)][1])
}else{
return("")
}
}
//第三个函数,获取邮件内容
get.msg<-function(msg.vec){
if(!(is.na(which(msg.vec=="")[1]+1))){
msg<-msg.vec[seq(which(msg.vec=="")[1]+1,length(msg.vec),1)]
return(paste(msg,collapse="\n"))
}else{
return("")
}
}
//第四个函数,获取邮件主题
get.subject<-function(msg.vec){
subj<-msg.vec[grepl("Subject: ",msg.vec)]
if(length(subj)>0){
return(strsplit(subj,"Subject: ")[[1]][2])
}else{
return("")
}
}
//第五个函数,获取邮件的接收时间和日期
//用冒号,加号或者减号分割日期
//替换掉首尾的空格
get.date<-function(msg.vec){
date.grep<-grepl("^Date: ",msg.vec)
date.grepl<-which(date.grep==TRUE)
date<-msg.vec[date.grepl[1]]
date<-strsplit(date,"\\+|\\-|: ")[[1]][2]
date<-gsub("^\\s+|\\s+$","",date)
return(strtrim(date,25))
}
//数据最终整理
easyham.docs<-dir(easyham.path)
easyham.docs<-easyham.docs[which(easyham.docs!="cdms")]
easyham.parse<-lapply(easyham.docs,function(p) parse.email(paste(easyham.path,
p,sep="")))
ehparse.matrix<-do.call(rbind,easyham.parse)
allparse.df<-data.frame(ehparse.matrix,stringAsFactors=FALSE)
names(allparse.df)<-c("Date","From.Email","Subject","Message","Path")
head(allparse.df)
//日期处理(可复用)
date.converter<-function(dates,pattern1,pattern2){
pattern1.convert<-strptime(dates,pattern1)
pattern2.convert<-strptime(dates,pattern2)
pattern1.convert[is.na(pattern1.convert)]<-
pattern2.convert[is.na(pattern2.convert)]
return(pattern1.convert)
}
//这个地方很容易引起问题,导致时间转换后结果是NA,需要增加下面这个系统设置
Sys.setlocale("LC_TIME", "C");
pattern1<-"%a, %d %b %Y %H:%M:%S"
pattern2<-"%d %b %Y %H:%M:%S"
allparse.df$Date<-date.converter(allparse.df$Date,pattern1,pattern2)
//处理其他数据
allparse.df$Subject<-tolower(allparse.df$Subject)
allparse.df$From.Email<-tolower(allparse.df$From.Email)
priority.df<-allparse.df[with(allparse.df,order(Date)),]
priority.train<-priority.df[1:(round(nrow(priority.df)/2)),]
//设置权重策略
library(plyr)
//在这里遇到的主要问题是:
Error: 'names' attribute [11] must be the same length as the vector [1]
原因和解决方案如下面英文解说,主要是时间转换后就占用了9列,所以加起来是不一样的:
#The problem is with the Date field,
#which is at this point a list that contains 9 fields
#(sec, min, hour, mday, mon, year, wday, yday, isdst).
#To solve the problem I've simply converted the dates into character vectors,
#used ddply then converted the dates back to Date
tmp <- priority.train$Date
priority.train$Date <- as.character(priority.train$Date)
from.weight<-ddply(priority.train, .(From.Email), summarise, Freq=length(Subject))
priority.train$Date <- tmp
rm(tmp)
head(from.weight)
names(from.weight)
#此处增加绘制条状图的功能,详细方法
#绘制条状图,查看数据情况
from.weight <- from.weight[with(from.weight, order(Freq)), ]
from.ex <- subset(from.weight, Freq > 6)
from.scales <- ggplot(from.ex) +
geom_rect(aes(xmin = 1:nrow(from.ex) - 0.5,
xmax = 1:nrow(from.ex) + 0.5,
ymin = 0,
ymax = Freq,
fill = "lightgrey",
color = "darkblue")) +
scale_x_continuous(breaks = 1:nrow(from.ex), labels = from.ex$From.Email) +
coord_flip() +
scale_fill_manual(values = c("lightgrey" = "lightgrey"), guide = "none") +
scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +
ylab("Number of Emails Received (truncated at 6)") +
xlab("Sender Address") +
theme_bw() +
theme(axis.text.y = element_text(size = 5, hjust = 1))
ggsave(plot = from.scales,
filename = file.path("images", "0011_from_scales.pdf"),
height = 4.8,
width = 7)
//计算每一个线程的活跃度
get.threads<-function(threads.matrix,email.df){
threads<-unique(threads.matrix[,2])
thread.counts<-lapply(threads,function(t) thread.counts(t,email.df))
thread.matrix<-do.call(rbind,thread.counts)
return(cbind(threads,thread.matrix))
}
thread.counts<-function(thread,email.df){
thread.times<-email.df$Date[which(email.df$Subject==thread|
email.df$Subject==paste("re:",thread))]
freq<-length(thread.times)
min.time<-min(thread.times)
max.time<-max(thread.times)
time.span<-as.numeric(difftime(max.time,min.time,units="secs"))
if(freq<2){
return(c(NA,NA,NA))
}else{
trans.weight<-freq/time.span
log.trans.weight<-10+log(trans.weight,base=10)
return(c(freq,time.span,log.trans.weight))
}
}
thread.weights<-get.threads(threads.matrix,priority.train)
thread.weights<-data.frame(thread.weights,StringAsFactors=FALSE)
names(thread.weights)<-c("Thread","Freq","Response","Weight")
thread.weights$Freq<-as.numeric(thread.weights$Freq)
thread.weights$Response<-as.numeric(thread.weights$Response)
#thread.weights$Weight<-as.numeric(thread.weights$Weight)
thread.weights<-subset(thread.weights,is.na(thread.weights$Freq)==FALSE)
head(thread.weights)
term.counts <- function(term.vec, control)
{
vec.corpus <- Corpus(VectorSource(term.vec))
vec.tdm <- TermDocumentMatrix(vec.corpus, control = control)
return(rowSums(as.matrix(vec.tdm)))
}
thread.terms <- term.counts(thread.weights$Thread,
control = list(stopwords = TRUE))
thread.terms <- names(thread.terms)
term.weights <- sapply(thread.terms,
function(t) mean(thread.weights$Weight[grepl(t, thread.weights$Thread, fixed = TRUE)]))
term.weights <- data.frame(list(Term = names(term.weights),
Weight = term.weights),
stringsAsFactors = FALSE,
row.names = 1:length(term.weights))
msg.terms <- term.counts(priority.train$Message,
control = list(stopwords = TRUE,
removePunctuation = TRUE,
removeNumbers = TRUE))
msg.weights <- data.frame(list(Term = names(msg.terms),
Weight = log(msg.terms, base = 10)),
stringsAsFactors = FALSE,
row.names = 1:length(msg.terms))
get.weights<-function(search.term,weight.df,term=TRUE){
if(length(search.term)>0){
if(term){
term.match<-match(names(search.term),weight.df$Term)
}else{
term.match<-match(search.term,weight.df$Term)
}
match.weights<-weight.df$Weight[which(!is.na(term.match))]
if(length(match.weights)>1){
return(1)
}else{
return(mean(match.weights))
}
}else{
return(1)
}
}
#排序的计算
rank.message <- function(path)
{
msg <- parse.email(path)
from <- ifelse(length(which(from.weight$From.Email == msg[2])) > 0,
from.weight$Weight[which(from.weight$From.Email == msg[2])],
1)
# Second is based on senders in threads, and threads themselves
thread.from <- ifelse(length(which(senders.df$From.Email == msg[2])) > 0,
senders.df$Weight[which(senders.df$From.Email == msg[2])],
1)
subj <- strsplit(tolower(msg[3]), "re: ")
is.thread <- ifelse(subj[[1]][1] == "", TRUE, FALSE)
if(is.thread)
{
activity <- get.weights(subj[[1]][2], thread.weights, term = FALSE)
}
else
{
activity <- 1
}
# Next, weight based on terms
# Weight based on terms in threads
thread.terms <- term.counts(msg[3], control = list(stopwords = TRUE))
thread.terms.weights <- get.weights(thread.terms, term.weights)
# Weight based terms in all messages
msg.terms <- term.counts(msg[4],
control = list(stopwords = TRUE,
removePunctuation = TRUE,
removeNumbers = TRUE))
msg.weights <- get.weights(msg.terms, msg.weights)
# Calculate rank by interacting all weights
rank <- prod(from,
thread.from,
activity,
thread.terms.weights,
msg.weights)
return(c(msg[1], msg[2], msg[3], rank))
}
# Find splits again
train.paths <- priority.df$Path[1:(round(nrow(priority.df) / 2))]
test.paths <- priority.df$Path[((round(nrow(priority.df) / 2)) + 1):nrow(priority.df)]
head(test.paths)
# Now, create a full-featured training set.
train.ranks <- suppressWarnings(lapply(train.paths, rank.message))
train.ranks.matrix <- do.call(rbind, train.ranks)
train.ranks.matrix <- cbind(train.paths, train.ranks.matrix, "TRAINING")
train.ranks.df <- data.frame(train.ranks.matrix, stringsAsFactors = FALSE)
names(train.ranks.df) <- c("Message", "Date", "From", "Subj", "Rank", "Type")
train.ranks.df$Rank <- as.numeric(train.ranks.df$Rank)
相关文章推荐
- html学习第一天笔记——第四、五章节
- Coursera公开课笔记: 斯坦福大学机器学习第十一课“机器学习系统设计(Machine learning system design)”
- 机器学习之感知机学习笔记第一篇:求输入空间R中任意一点X0到超平面S的距离
- [机器学习] Coursera ML笔记 - 监督学习(Supervised Learning) - Representation
- Coursera台大机器学习基础课程学习笔记2 -- 机器学习的分类
- opencv视频学习第四课(opencv读视频和摄像头)笔记整理
- Linux 学习笔记 -- 第四部分 Linux 使用着管理 -- 第15章 磁盘配额 (Quota) 与高级文件系统管理
- 吴恩达Coursera深度学习课程 DeepLearning.ai 提炼笔记(3-1)-- 机器学习策略(1)
- 机器学习基石笔记6——为什么机器可以学习(2)
- ArcGIS API for JavaScript 4.2学习笔记[5] 官方API大章节概述与内容转译
- 机器学习入门学习笔记:(4.2)SVM的核函数和软间隔
- 笔记: 斯坦福大学机器学习第九课“神经网络的学习(Neural Networks: Learning)”
- 机器学习—FullBNT学习笔记之一(matlab)
- 人工智障学习笔记——机器学习(15)t-SNE降维
- 台湾大学林轩田机器学习技法课程学习笔记5 -- Kernel Logistic Regression
- 【吴恩达机器学习】学习笔记——1.3机器学习的定义
- 台大-林轩田老师-机器学习基石学习笔记10
- 《机器学习》-- 周志华 (第一章学习笔记)
- 机器学习笔记—-监督学习与无监督学习的异同
- 04、刘媛媛《普通人快速崛起31节修炼课》 第四讲 个人学习笔记