您的位置：首页 > 编程语言

处理芯片cel格式数据的全自动R代码

2014-01-10 21:18 721 查看

>source(”http://bioconductor.org/biocLite.R”)

>biocLite()

全自动。只需要设置工作路径以及filelist.txt文件即可。

filelist.txt，必须与CEL文件一起放置在工作目录内，文件以空格为间隔，分两列，分别是filename和factor 例

filename factor

MS1_(Mouse430_2).cel control

MS2_(Mouse430_2).cel control

MS3_(Mouse430_2).cel control

MS4_(Mouse430_2).cel beta-catenin

MS5_(Mouse430_2).cel beta-catenin

MS6_(Mouse430_2).cel beta-catenin

MS7_(Mouse430_2).cel TCF4

MS8_(Mouse430_2).cel TCF4

MS9_(Mouse430_2).cel TCF4

MS10_(Mouse430_2).cel Icat

MS11_(Mouse430_2).cel Icat

MS12_(Mouse430_2).cel Icat

source("http://pgfe.umassmed.edu/ou/bioconductor/affyChipAna.R")

setwd("~/Documents/Rscript/testData")

doAll()

更多参数：

dataInput(file=”filelist.txt”,sep=” “,rnames=”filename”)

getExpr(data)

QCreport(data,fastQCfile=”fastQC.pdf”,arrayQMfile=”QCreport”)

getDesign(data)

getContrastMatrix(data,design,constr=NULL)

doAnalyze(data,design,contrast.matrix)

outputData(data,fit2,contrast.matrix,times=2,cut.p.val=0.01,cut.fdr.val=NULL,outputFolder=”output”,all=FALSE,scale=”row”,cluster_rows=TRUE,cluster_cols=TRUE)

doAll(file=”filelist.txt”,workingdir=”.”,sep=” “,rnames=”filename”,fastQCfile=”fastQC.pdf”,arrayQMfile=”QCreport”,

constr=NULL,times=2,cut.p.val=0.01,cut.fdr.val=NULL,outputFolder=”output”,

all=FALSE,scale=”row”,doQC=TRUE,cluster_rows=TRUE,cluster_cols=TRUE)

可以指定design & contrast.matrix: design and contrast.matrix for limma, coef: report contrast, times: folder-change, cut.fdr.val: P.adj.Val cutoff value

source:

doAll原代码

# source("http://bioconductor.org/biocLite.R")

# biocLite(c("affy","annotate","annaffy","affyQCReport","arrayQualityMetrics","limma","pheatmap"))

dataInput<-function(file="filelist.txt",sep=" ",rnames="filename"){
require(limma,quietly=TRUE)
require(affy,quietly=TRUE)
require(annotate,quietly=TRUE)
targets<-readTargets(file=file,sep=sep,row.names=rnames) #limma
data <- ReadAffy(filenames=targets$filename) #affy
affydb<-annPkgName(data@annotation,type="db")
list("targets"=targets,"data"=data,"affydb"=affydb)

}

getExpr<-function(data){
require(affy,quietly=TRUE)
require(annotate,quietly=TRUE)
require(annaffy,quietly=TRUE)
require(data$affydb,character.only = TRUE,quietly=TRUE)
data$eset<-rma(data$data,verbose=FALSE)
try({
APInfo<-mas5calls(data$data,verbose=FALSE)
present.call<-exprs(APInfo)
})
eset.e<-exprs(data$eset)
if(exists("present.call")){
data$RPMA<-merge(eset.e,present.call,by="row.names")
colnames(data$RPMA)<-make.names(gsub("_\$.*?\$\\.CEL","",colnames(data$RPMA),ignore.case=TRUE))
colnames(data$RPMA)<-gsub("\\.y$",".present",colnames(data$RPMA))
colnames(data$RPMA)<-gsub("\\.x$",".rma",colnames(data$RPMA))
}else{
colnames(eset.e)<-paste(colnames(eset.e),"rma",sep=".")
Row.names=rownames(eset.e)
data$RPMA<-cbind(Row.names,eset.e)
data$RPMA<-data.frame(data$RPMA)
}
data$symbols<-as.character(aafSymbol(as.character(data$RPMA$Row.names),data$affydb))
names(data$symbols)<-as.character(data$RPMA$Row.names)
data$genes<-as.character(aafUniGene(as.character(data$RPMA$Row.names),data$affydb))
names(data$genes)<-as.character(data$RPMA$Row.names)
data

}

QCreport<-function(data,fastQCfile="fastQC.pdf",arrayQMfile="QCreport"){
require(affyQCReport,quietly=TRUE)
require(arrayQualityMetrics,quietly=TRUE)
QCReport(data$data,file=fastQCfile)
arrayQualityMetrics(expressionset = data$eset, outdir = paste(arrayQMfile,"RMA",sep="_"),force=T)
arrayQualityMetrics(expressionset = data$data, outdir = paste(arrayQMfile,"Raw",sep="_"),force=T,do.logtransform=T)
eset.e<-exprs(data$eset)
d<-dist(t(eset.e))
hc<-hclust(d,method="complete")
pdf("hclust.complete.pdf")
plot(hc)
dev.off()
cv.y<-numeric()
y.sd<-apply(eset.e,1,sd)
y.mean<-apply(eset.e,1,mean)
cv.y<-y.sd/y.mean
x<-eset.e
y2<-cbind(x,cv.y)
y2<-y2[y2[,ncol(y2)]>0.10,1:(ncol(y2)-1)]
d<-dist(t(y2))
hc<-hclust(d,"ave")
pdf("hclust.ave.pdf")
plot(hc)
dev.off()

}

getDesign<-function(data){
targets<-data$targets
g<-factor(targets[,2])
design<-model.matrix(~-1+g)
if((ncol(targets)>2)){
fo<-"~-1+g"
for(i in 3:ncol(targets)){
assign(paste("v",colnames(targets)[i],sep=""),factor(targets[,i]))
fo<-paste(fo,paste("v",colnames(targets)[i],sep=""),sep="+")
}
design<-model.matrix(as.formula(fo))
colnames(design)<-gsub("^v","",colnames(design))
}
colnames(design)<-gsub("^g","",colnames(design))
colnames(design)<-gsub("-","_",colnames(design))
colnames(design)<-make.names(colnames(design))
design

}

getContrastMatrix<-function(data,design,constr=NULL){
require(limma,quietly=TRUE)
if(is.null(constr)){
g<-factor(data$targets[,2])
design.col.names<-levels(g)
if(length(design.col.names)>2){
for(i in 1:(length(design.col.names)-1)){
for(j in (i+1):length(design.col.names)){
constr<-c(constr,paste(design.col.names[i],design.col.names[j],sep="-"))
}
}
}else{
constr<-paste(design.col.names[2],design.col.names[1],sep="-")
}
}
contrast.matrix<-makeContrasts(contrasts=constr,levels=design)
contrast.matrix

}

doAnalyze<-function(data,design,contrast.matrix){
require(limma,quietly=TRUE)
fit<-lmFit(data$eset,design)
fit1<-contrasts.fit(fit,contrast.matrix)
fit2<-eBayes(fit1)
fit2

}

outputData<-function(data,fit2,contrast.matrix,times=2,cut.p.val=0.01,cut.fdr.val=NULL,outputFolder="output",all=FALSE,scale="row",cluster_rows=TRUE,cluster_cols=TRUE){
require(limma,quietly=TRUE)
require(pheatmap,quietly=TRUE)
dir.create(outputFolder)
if(file.exists(outputFolder)) setwd(outputFolder)
results<-NULL
results.s<-NULL
constr<-colnames(contrast.matrix)
len<-length(constr)
times<-log(times,2)
symbols<-as.matrix(data$symbols)
colnames(symbols)<-"symbol"
genes<-as.matrix(data$genes)
colnames(genes)<-"gene"
for(i in 1:len){
results[[i]]<-topTable(fit2,coef=i,n=nrow(fit2))
results[[i]]$AveExpr<-NULL
try({
if(colnames(results[[i]])[1]!="ID") results[[i]] <- cbind(ID=rownames(results[[i]]), results[[i]])
index<-grepl("AFFX",results[[i]]$ID)
results[[i]]<-results[[i]][!index,]
})
if(colnames(results[[i]])[1]!="ID") stop("expect first column to be ID")
results.s[[i]]<-results[[i]]
colnames(results.s[[i]])[2:6]<-paste(constr[i],colnames(results.s[[i]])[2:6],sep=".")
}
merge.results.s<-results.s[[1]]
if(len>1){
for(i in 2:len){
merge.results.s<-merge(results.s[[i]],merge.results.s,by="ID")
}
}
all.results.s<-merge(merge.results.s,data$RPMA,by.x="ID",by.y="Row.names")
all.results.s<-merge(genes,all.results.s,by.x="row.names",by.y="ID")
all.results.s<-merge(symbols,all.results.s,by.x="row.names",by.y="Row.names")
colnames(all.results.s)[1]<-"ID"
write.csv(all.results.s,"all.results.csv",row.names=FALSE)
for(i in 1:len){
tmp<-results[[i]]
tmp<-merge(tmp,data$RPMA,by.x="ID",by.y="Row.names")
tmp<-merge(genes,tmp,by.x="row.names",by.y="ID")
tmp<-merge(symbols,tmp,by.x="row.names",by.y="Row.names")
colnames(tmp)[1]<-"ID"
tmp<-tmp[tmp$P.Value<cut.p.val,]
if(!is.null(cut.fdr.val)) tmp<-tmp[tmp$adj.P.Val<cut.fdr.val,]
up<-tmp[tmp$logFC>times,]
lo<-tmp[tmp$logFC<(-times),]
up<-up[order(-up$logFC),]
lo<-lo[order(lo$logFC),]
write.csv(up,paste("up",constr[i],"csv",sep="."),row.names=FALSE)
write.csv(lo,paste("lo",constr[i],"csv",sep="."),row.names=FALSE)
if(all) selected<-data$RPMA[data$RPMA$Row.names %in% c(up$ID,lo$ID),]
else selected<-rbind(up,lo)
if(nrow(selected)>2){
try({
colnames(selected)[1]<-"ID"
sname<-symbols[selected$ID,"symbol"]
sname[sname=="character(0)"]<-"---"
fmt<-paste("%-",max(nchar(sname)),"s",sep="")
bname<-sprintf(fmt,sname)
sname<-paste(bname,names(sname),sep=" ")
rownames(selected)<-sname
selected<-selected[,grepl(".rma$",colnames(selected))]
colnames(selected)<-gsub(".rma$","",colnames(selected))
if(all){
annotation<-data$targets[,2:ncol(data$targets)]
rownames(annotation)<-make.names(gsub("_\$.*?\$\\.CEL","",rownames(annotation),ignore.case=TRUE))
}else{
gp<-unlist(strsplit(constr[i],"-"))
gp<-gsub("[\$\$]","",gp)
targets<-data$targets[data$targets$factor %in% gp,]
selcol<-make.names(gsub("_\$.*?\$\\.CEL","",rownames(targets),ignore.case=TRUE))
selected<-selected[,selcol]
annotation<-targets[,2:ncol(targets),drop=FALSE]
rownames(annotation)<-selcol
}
pdf(paste("heatmap",constr[i],"pdf",sep="."),width=8+ncol(selected)/10,height=2+nrow(selected)/10)
pheatmap(selected, annotation=annotation, scale=scale, fontsize=9, fontsize_row=6, fontfamily="mono", cluster_rows=cluster_rows, cluster_cols=cluster_cols)
dev.off()
})
}
}

}

doAll<-function(file="filelist.txt",workingdir=".",sep=" ",rnames="filename",fastQCfile="fastQC.pdf",arrayQMfile="QCreport",

constr=NULL,times=2,cut.p.val=0.01,cut.fdr.val=NULL,outputFolder="output",

all=FALSE,scale="row",doQC=TRUE,cluster_rows=TRUE,cluster_cols=TRUE){
setwd(workingdir)
cat("Data input\n")
data<-dataInput(file,sep,rnames)
cat("Get Expression\n")
data<-getExpr(data)
if(doQC){
cat("Do QC\n")
QCreport(data,fastQCfile,arrayQMfile)
}
cat("Do limma analysis\n")
design<-getDesign(data)
contrast.matrix<-getContrastMatrix(data,design,constr)
fit2<-doAnalyze(data,design,contrast.matrix)
cat("Data output\n")
outputData(data,fit2,contrast.matrix,times,cut.p.val,cut.fdr.val,outputFolder,all,scale,cluster_rows,cluster_cols)
cat("..Done..\n")

}

#从GEO上下载表达谱数据

library(GEOquery)

gset<-getGEO("GSE*****",GSEMatrix=TRUE)

length(gset)

#load NCBI platform annotation

gpl<-annotation(gset)

platf<-getGEO(gpl,AnnotGPL=TRUE)

ncbifd <- data.frame(attr(dataTable(platf), "table"))

eset <- exprs(gset)

head(eset[,1:5])

GSM1123782 GSM1123783 GSM1123784 GSM1123785 GSM1123786

1007_s_at 10.1689 10.5247 10.8179 10.3539 10.4964

1053_at 9.6002 7.9436 9.8653 9.9733 10.1960

117_at 5.6808 5.0301 3.7654 2.7751 2.8179

121_at 4.2268 4.6148 4.6147 4.4977 4.6147

1255_g_at 2.1869 2.1869 2.1869 2.1869 2.1869

1294_at 2.1874 2.1874 2.1874 2.1874 2.1874

> head(ncbifd[,1:5])

ID Gene.title Gene.symbol Gene.ID UniGene.title

1 1007_s_at discoidin domain receptor tyrosine kinase 1 DDR1 780 <NA>

2 1053_at replication factor C (activator 1) 2, 40kDa RFC2 5982 <NA>

3 117_at heat shock 70kDa protein 6 (HSP70B') HSPA6 3310 <NA>

4 121_at paired box 8 PAX8 7849 <NA>

5 1255_g_at guanylate cyclase activator 1A (retina) GUCA1A 2978 <NA>

6 1294_at ubiquitin-like modifier activating enzyme 7 UBA7 7318 <NA>

#从GEO上下载cel文件

library(GEOquery)

> getGEOSuppFiles("GSE46106")

[1] "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE46nnn/GSE46106/suppl/"

trying URL 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE46nnn/GSE46106/suppl//GSE46106_RAW.tar'

ftp data connection made, file length 399247360 bytes

opened URL

===============================================

downloaded 380.8 Mb

trying URL 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE46nnn/GSE46106/suppl//filelist.txt'

ftp data connection made, file length 3308 bytes

opened URL

==================================================

downloaded 3308 bytes

> setwd("GSE46106/")

> dir()

[1] "filelist.txt" "GSE46106_RAW.tar"

> untar("GSE46106_RAW.tar")

> files <- dir(pattern="gz$")

> sapply(files, gunzip)

GSM1123782_2147TG11_U133plus2.CEL.gz GSM1123783_2147TG15_U133plus2.CEL.gz GSM1123784_2147TG1_U133plus2.CEL.gz

33380229 32529207 33390958

GSM1123785_2147TG6_U133plus2.CEL.gz GSM1123786_2277TG1_U133plus2.CEL.gz GSM1123787_2277TG9_U133plus2.CEL.gz

33210730 33082826 33531908

GSM1123788_2665TG15_U133plus2.CEL.gz GSM1123789_2665TG1_U133plus2.CEL.gz GSM1123790_2665TG6_U133plus2.CEL.gz

32232774 33380782 33432878

GSM1123791_3104TG1_U133plus2.CEL.gz GSM1123792_3104TG3_U133plus2.CEL.gz GSM1123793_3107TG1_U133plus2.CEL.gz

32850644 32588606 32584342

GSM1123794_3107TG5_U133plus2.CEL.gz GSM1123795_3143TG1_U133plus2.CEL.gz GSM1123796_3143TG5_U133plus2.CEL.gz

33358932 32369391 32806108

GSM1123797_3204TG1_U133plus2.CEL.gz GSM1123798_3204TG5_U133plus2.CEL.gz GSM1123799_3561TG1_U133plus2.CEL.gz

32557782 33016324 32883029

GSM1123800_3561TG5_U133plus2.CEL.gz GSM1123801_3611TG1_U133plus2.CEL.gz GSM1123802_3611TG5_U133plus2.CEL.gz

33454274 32657648 33126603

GSM1123803_3613TG1_U133plus2.CEL.gz GSM1123804_3613TG5_U133plus2.CEL.gz GSM1123805_3807TG1_U133plus2.CEL.gz

32581902 33193826 32791917

GSM1123806_3807TG5_U133plus2.CEL.gz GSM1123807_3824TG2_U133plus2.CEL.gz GSM1123808_3887TG1_U133plus2.CEL.gz

33266865 33078422 32479022

GSM1123809_3887TG5_U133plus2.CEL.gz GSM1123810_3904TG1_U133plus2.CEL.gz GSM1123811_3904TG5_U133plus2.CEL.gz

33291287 32645537 33425796

GSM1123812_3936TG1_U133plus2.CEL.gz GSM1123813_3936TG5_U133plus2.CEL.gz GSM1123814_3963TG1_U133plus2.CEL.gz

32939923 33317196 32725410

GSM1123815_3963TG5_U133plus2.CEL.gz GSM1123816_4013TG1_U133plus2.CEL.gz GSM1123817_4169TG2_U133plus2.CEL.gz

33276082 33151406 33167461

GSM1123818_4175TG1_U133plus2.CEL.gz GSM1123819_4272TG1_U133plus2.CEL.gz GSM1123820_4400TG1_U133plus2.CEL.gz

33139829 33111635 33206021

GSM1123821_4400TG5_U133plus2.CEL.gz GSM1123822_4664TG1_U133plus2.CEL.gz GSM1123823_4849TG1_U133plus2.CEL.gz

33026806 33364390 33300210

GSM1123824_4888TG1_U133plus2.CEL.gz GSM1123825_4913TG1_U133plus2.CEL.gz

33299396 33207528

> filelist <- dir(pattern="CEL$")

> library(affy)

> library(annotate)

> data <- ReadAffy(filenames=filelist)

> affydb<-annPkgName(data@annotation,type="db")

> require(affydb, character.only=TRUE)

> eset<-rma(data,verbose=FALSE)

> eset.e <- exprs(eset)

> library(annaffy)

> symbols<-as.character(aafSymbol(as.character(rownames(eset)),affydb))

> genes<-as.character(aafUniGene(as.character(rownames(eset)),affydb))

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航