WGCNA代码

Python0100

WGCNA代码,第1张

WGCNA

rm(list = ls())

#加载R包

library(WGCNA)

library(tidyr)

library(ggplot2)

library(gridExtra)

setwd("D:\\项目探索\\水稻EDP\\文献\\WGCNA数据结果20190426")

#=================================================================#

#

# 1.导入表达数据

#

#=================================================================#

gene_exp <- read.table('D:\\项目探索\\水稻EDP\\文献\\WGCNA数据结果20190426\\DSC.txt',

                         sep = '\t',

                         header = T,

                         stringsAsFactors = F,

                         #第一列作为行名

                         row.names = 1)

gene_exp[is.na(gene_exp)] <- 0 # 将NA转换为0

# 根据方差筛选

m.vars=apply(gene_exp,1,var)

expro.upper=gene_exp[which(m.vars>quantile(m.vars, probs = seq(0, 1, 0.25))[4]),]

dim(expro.upper)

datExpr <- t(expro.upper) # 在WGCNA中需要转置表达矩阵

gsg = goodSamplesGenes(datExpr,verbose = 3)

gsg$allOK

#查看行和列

dim(datExpr)

# 聚类分析是否存在离群值

sampleTree = hclust(dist(datExpr),method = "average")

sizeGrWindow(12,9)

par(cex=0.6)

par(mar = c(0,4,2,0))

plot(sampleTree,

     main = "Sample clustering to detect outliners",

     sub = "",xlab = "",cex.lab=1.5,

     cex.axis=1.5,cex.main = 2)

# 存在离群样本时可以剔除

if(F){abline(h = 15,col = "red")

clust = cutreeStatic(sampleTree,cutHeight = 15,minSize = 10)

table(clust)

keepSamples = (clust==1)

datExpr =datExpr0[keepSamples,]

nGenes = ncol(datExpr)

nSamples = nrow(datExpr)

}

#=================================================================#

#

# 2.寻找最佳β值

#

#=================================================================#

# 选择并构建一个软阈值向量

powers = c(c(1:10),seq(from = 12,to = 20,by=2)

# 执行TOM功能

sft = pickSoftThreshold(datExpr,powerVector = powers,verbose = 5)

# 绘图

sizeGrWindow(9,5)

par(mfrow = c(1,2)) # 一页多图

cex1 = 0.9

# Scale-free topology fit index as a function of the soft-thresholding power

plot(sft$fitIndices[,1],-sign(sft$fitIndices[,3])*sft$fitIndices[,2],

xlab="Soft Threshold(power)",ylab = "Scale Free Topology Model Fit,signed R^2",type="n",

main = paste("Scale independence"))

text(sft$fitIndices[,1],-sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red")

abline(h = 0.90,col = "red")

#  Mean connectivity as a function of the soft-thresholding power

plot(sft$fitIndices[,1],sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab ="Mean Connectivity",type = "n",main = paste("Mean connectivity"))

text(sft$fitIndices[,1],sft$fitIndices[,5],labels=powers,cex = cex1,col="red")

#=================================================================#

#

# 3.一步法构建网络模块

#3.1 minModuleSize指定Module所含基因数最少

#3.2 mergeCutHeight指定合并Module的阈值

#3.3 列表net中包含了许多信息

#3.4 可以通过recutBlockwiseTrees函数来修改模块的一些参数,而不用重新计算聚类树

#

#=================================================================#

net = blockwiseModules(datExpr,power = sft$powerEstimate,

TOMType = "unsigned",minModuleSize = 30,reassignThreshold = 0,mergeCutHeight = 0.25,

numericLabels =TRUE,pamRespectsDendro = FALSE,saveTOMs = TRUE,saveTOMFileBase = "RiceTOM",

verbose = 3)

table(net$colors)

# 绘图

sizeGrWindow(12,9)

# 将标签转换为颜色

mergedColors = labels2colors(net$colors)

# 绘制树图及模块颜色图

plotDendroAndColors(net$dendrograms[[1]],mergeColors[net$blockGenes[[1]],"Module colors",dendroLabels = FALSE,hang = 0.03,addGuide = TRUE,guideHang = 0.05)

# 保存相关数据

moduleLabels = net$colors

moduleColors = labels2colors(net$colors)

MEs = net$MEs

geneTree = net$dendrograms[[1:2]] # 可能会存在基因较多,默认将基因分成了两部分

#=================================================================#

# 3.步步法构建

# 3.构建网络

# 3.1.计算相关系数

# 3.2.计算邻接矩阵

# 3.3.计算TOM矩阵

# 3.4.聚类并且划分模块

# 3.5.合并相似模块

#=================================================================#

# 计算邻接矩阵

softpower = sft$powerEstimate

adjacency = adjacency(datExpr,power = softPower)

# 通过邻接矩阵计算TOM矩阵

TOM = TOMsimilarity(adjacency)

dissTOM = 1 - TOM # 此为距离矩阵

# 通过距离矩阵进行聚类分析基因

geneTree = hclust(as.dist(dissTOM),method = "average")

sizeGrWindow(12,9)

plot(geneTree,xlab="",sub="",main="Gene clustering on TOM-based dissimilarity",labels=FALSE,hang = 0.04)

# 设置模块中基因最小数

minModuleSize = 30

# 通过动态树剪切鉴定出模块

dynamicMods = cutreeDynamic(dendro = geneTree,disM = dissTOM,deepSplit = 2,pamRespectsDendro = FALSE,minClusterSize = minModuleSize)

table(dynamicMods)

# 绘图

dynamicColors = labels2colors(dynamicMods)

table(dynamicColors)

sizeGrWindow(8,6)

plotDendroAndColors(geneTree,dynamicColors,"Dynamic Tree Cut",dendroLabels = FALSE,hang = 0.03,addGuide = TRUE,guideHang = 0.05,main = " Gene dendrogram and module colors")

# 合并表达近似的Modules

# 计算Modules的特征基因

MEList = moduleEigengenes(datExpr,color = dynamicColors)

MEs = MEList$eigengenes

# 计算Modules之间的相关性

MEDiss = 1 - cor(MEs)

# 聚类特征基因

METree = hclust(as.dist(MEDiss),method = "average")

# 绘图

sizeGrWindow(7,6)

plot(METree,main = "Clustering of eigengenes",xlab="",sub="")

# 自动合并

MEDissThres = 0.25

abline(h=MEDissThres,col = "red")

merge = mergeCloseModules(datExpr,dynamicColors,cutHeight = MEDissThres,verbose = 3)

mergedColors = merge$colors

mergedMEs = merge$newMEs

# 绘图对比合并前后模块

sizeGrWindow(12,9)

plotDendroAndColors(geneTree,cbind(dynamicColors,mergedColors),c("Dynamic Tree Cut","Merged dynamic"),dendroLabels = FALSE,hang =0.03,addGuide = TRUE,guideHang = 0.05)

# 保存相关的变量数据

moduleColors = mergedColors

# 转换为数字标签

colorOrder = c("grey",standardColors(50))

moduleLabels = match(moduleColors,colorOrder)-1

MEs = mergedMEs

#=================================================================#

#

# 3.大数据集处理

# 基本思想:使用两层聚类

#

#=================================================================#

#  Block-wise network construction and module detection

bwnet <- blockwiseModules(datExpr,maxBlockSize = 2000,   # 指定电脑最大可运行的模块

power = sft$powerEstimate,TOMtype ="unsigned",minModuleSize = 30,

reassignThreshold = 0, mergeCutHeight = 0.25,numericLabels = TRUE,

saveTOMs = TRUE,saveTOMFileBase = "RiceTOM-blockwise",

verbose = 3)

bwLabels = matchLabels(bwnet$colors,moduleLabels)

bwModuleColors = labels2colors(bwLabels)

table(bwLabels)

# 绘图

sizeGrWindow(6,6)

# block1绘图

plotDendroAndColors(bwnet$dendrograms[[1]], bwModuleColors[bwnet$blockGenes[[1]]], "Module colors", main = "Gene dendrogram and module colors in block 1", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)

# block2绘图

plotDendroAndColors(bwnet$dendrograms[[2]], bwModuleColors[bwnet$blockGenes[[2]]], "Module colors", main = "Gene dendrogram and module colors in block 2", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)

# 绘图比较single block和block-wise的区别

sizeGrWindow(12,9) 

plotDendroAndColors(geneTree, cbind(moduleColors, bwModuleColors), c("Single block", "2 blocks"), main = "Single block gene dendrogram and module colors", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)

# 通过Eigengenes比较single block和block-wise的区别

singleBlockMEs = moduleEigengenes(datExpr, moduleColors)$eigengenes

blockwiseMEs = moduleEigengenes(datExpr, bwModuleColors)$eigengenes

single2blockwise = match(names(singleBlockMEs), names(blockwiseMEs))

signif(diag(cor(blockwiseMEs[, single2blockwise], singleBlockMEs)), 3)

#=================================================================#

#

# 4.导入性状数据

#

#=================================================================#

traitData = read.csv("ClinicalTraits.csv")

dim(traitData) 

names(traitData)

# 与表达数据进行校准后构建性状数据框

RiceSamples = rownames(datExpr)

traitRows = match(RiceSamples, allTraits$Sample)

datTraits = allTraits[traitRows, -1]

rownames(datTraits) = allTraits[traitRows, 1]

#=================================================================#

#

# 5.模块与性状的相关关系

#

#=================================================================#

# 量化Module-trait的关系

nGenes = ncol(datExpr)

nSamples = nrow(datExpr)

MEs0 = moduleEigengenes(datExpr,moduleColors)$eigengenes

MEs = orderMEs(MEs0)

moduleTraitCor = cor(MEs,datTraits,use = "p")

moduleTraitPvalue = corPvalueStudent(moduleTrsitCor,nSamples)

# 绘图

sizeGrWindow(10,6)

textMatrix = paste(signif(moduleTraitCor, 2), "\n(", signif(moduleTraitPvalue, 1), ")", sep = "")

dim(textMatrix) = dim(moduleTraitCor)

par(mar = c(6, 8.5, 3, 3))

labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels = names(MEs), ySymbols = names(MEs), colorLabels = FALSE, colors = greenWhiteRed(50), textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1), main = paste("Module-trait relationships"))

#=================================================================#

#

# 6.基因与模块、性状的相关关系

#

#=================================================================#

# 确定基因与模块的关系-MM

modNames = substring(names(MEs),3)  # 提取模块数字标签

geneModuleMembership = as.data.frame(cor(datExpr,MEs,use = "p"))

MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples)) # 通过计算每个Module的Eigengenes与基因的关系

names(geneModuleMembership) = paste("MM", modNames, sep="")

names(MMPvalue) = paste("p.MM", modNames, sep="")

# 提取某个特定的性状

Treatment = as.data.frame(datTraits$Treatment)

names(weight) = "Treatment"

# 确定基因与性状的关系-GS

geneTraitSignificance = as.data.frame(cor(datExpr, Treatment, use = "p")) 

GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))

names(geneTraitSignificance) = paste("GS.", names(Treatment), sep="")

names(GSPvalue) = paste("p.GS.", names(Treatment), sep="")

#=================================================================#

#

# 7.获取与性状关联度大的特定模块中GS高且MM高的基因

#

#=================================================================#

# 绘制特定模块GS和MM热图

module = "saddlebrown"

column = match(module,modNames)

moduleGenes = moduleColors==module # 获取特定列的数据,只是一种数据提取方法而已

sizeGrWindow(7, 7)

par(mfrow = c(1,1))

verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]), abs(geneTraitSignificance[moduleGenes, 1]), xlab = paste("Module Membership in", module, "module"), ylab = "Gene significance for body weight", main = paste("Module membership vs. gene significance\n"), cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)

# 获取特定模块的特定基因

probes = names(datExpr)[moduleColors==module]

# 输出总体结果数据框

geneInfo0 = data.frame(substanceBXH = probes, geneSymbol = annot$gene_symbol[probes2annot], LocusLinkID = annot$LocusLinkID[probes2annot], moduleColor = moduleColors, geneTraitSignificance, GSPvalue)

# 根据与性状显著相关进行排序

modOrder = order(-abs(cor(MEs, weight, use = "p")))

# 添加模块关系信息

for (mod in 1:ncol(geneModuleMembership)) 

{ oldNames = names(geneInfo0) geneInfo0 = data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]], MMPvalue[, modOrder[mod]])names(geneInfo0) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""), paste("p.MM.", modNames[modOrder[mod]], sep="")) }

# 第一根据颜色第二根据基因性状显著性进行排序

geneOrder = order(geneInfo0$moduleColor, -abs(geneInfo0$GS.weight))

geneInfo = geneInfo0[geneOrder, ]

#=================================================================#

#

# 8.进行后续分析GO功能注释等

#

#=================================================================#

# 选择感兴趣的Modules

intModules = c("brown", "red", "salmon")

for (module in intModules)

 {

 # Select module probes 

modGenes = (moduleColors==module) 

# Get their entrez ID codes 

modLLIDs = allLLIDs[modGenes]

# Write them into a file 

fileName = paste("LocusLinkIDs-", module, ".txt", sep="")

write.table(as.data.frame(modLLIDs), file = fileName, row.names = FALSE, col.names = FALSE) 

}

加权基因共表达网络分析 (WGCNA, Weighted correlation network analysis)是用来描述不同样品之间基因关联模式的系统生物学方法,可以用来鉴定高度 协同变化 的基因集, 并根据基因集的内连性和基因集与表型之间的关联鉴定候补生物标记基因或治疗靶点。

相比于只关注差异表达的基因,WGCNA利用数千或近万个变化最大的基因或全部基因的信息识别感兴趣的基因集,并与表型进行显著性关联分析。一是充分利用了信息,二是把数千个基因与表型的关联转换为数个基因集与表型的关联,免去了多重假设检验校正的问题。

以上内容引用的网页:http://blog.genesino.com/2018/04/wgcna/#wgcna%E5%9F%BA%E6%9C%AC%E6%A6%82%E5%BF%B5

感觉WGCNA挺合自己脾气的,想安装一下。注定安装不会那么顺利。

安装了WGCNA是在Rstuio上安装的。用命令 install.packages("WGCNA")

执行完命令后,提示信息显示有三个依赖包(‘impute’, ‘preprocessCore’, ‘GO.db’)无法安装。

然后再安装那三个依赖包。

 BiocManager::install("impute"),安装impute时,也会安装GO.db包。遇到是否更新其他R包时,选择不更新 n.

>library("WGCNA")

Error: package or namespace load failed for ‘WGCNA’ in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]):

 不存在叫‘preprocessCore’这个名字的程辑包

然后再安装一下‘preprocessCore’包

BiocManager::install("preprocessCore")

做完以上工作,再执行一下 >library("WGCNA").

其中提示信息:载入需要的程辑包:dynamicTreeCut  ;载入需要的程辑包:fastcluster ;载入程辑包:‘fastcluster’    意思是在使用WGCNA时,需要先载入这三个包,并且已经被直接载入了,不是错误信息。