1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
| #引用包 library(Biobase) library(GEOquery) library(biomaRt)
#输入GSE系列号和GPL GSE="GSE67545" GPL="GPL15034" #输入gene所在列名称,一般为Gene symbol genename="Gene symbol"
#提取下载GEO数据 gset <- getGEO(GSE, GSEMatrix =T, getGPL = T, AnnotGPL = F) if (length(gset) > 1) idx <- grep(GPL, attr(gset, "names")) else idx <- 1 gset <- gset[[idx]]
#查看gset里的信息 str(gset)
#提取表达矩阵 afexp<-data.frame(exprs(gset))
#如果gset里没有gene symbol就需要用其它方法进行注释,但是可以先把探针矩阵整理出来(需要整理探针矩阵可以去除#运行) #保存探针矩阵 #annmatrix=rbind(ID=colnames(afexp),afexp) #write.table(annmatrix,file="annmatrix.txt",sep="\t",quote=F,col.names = F)
#简单看看平台的基因名称在哪,顺便提取一下吧 head(gset@featureData@data) nname=which(colnames(gset@featureData@data)==genename) afexp$ID=as.character(gset@featureData@data[,nname])
#保存探针信息 ann=cbind(as.character(gset@featureData@data[,nname]),rownames(afexp)) write.table(ann,file="ann.xls",sep="\t",quote=F,col.names = F,row.names = F)
#删除没有gene symbol的探针组 afexp<-afexp[afexp$ID!="",] #对重复基因取平均值并保存好整理的表达矩阵 uniafexp<-aggregate(.~ID,afexp,mean) write.table(uniafexp,file="matrix.txt",sep="\t",quote=F,col.names = T,row.names = F)
#保存样本处理信息 cli=pData(gset) cliaf=rbind(ID=colnames(cli),cli) write.table(cliaf,file="clinical.xls",sep="\t",quote=F,col.names = F)
#整理保存基因的biotype信息 ensembl <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", #小鼠是mmusculus_gene_ensembl,大鼠是rnorvegicus_gene_ensembl host = "https://useast.ensembl.org")# www报错,改为useast biotype <- getBM(attributes = c("gene_biotype", "hgnc_symbol"), filters = "hgnc_symbol", #小鼠是mgi_symbol,大鼠是mgi_symbol values = uniafexp[,1], mart = ensembl) write.table(biotype,file="biotype.xls",sep="\t",quote=F,col.names = T,row.names = F)
|