WGCNA分析 | 代码一-慈云数据

注意：今天的教程比较长，请规划好你的时间。本文是付费内容，在本文文末有本教程的全部的代码和示例数据。

分析代码

关于WGCNA分析，如果你的数据量较大，建议使用服务期直接分析，本地分析可能导致R崩掉。

设置文件位置

setwd("~/00_WGCNA/20230217_WGCNA/WGCNA_01")

加载分析所需的安装包

install.packages("WGCNA")
#BiocManager::install('WGCNA')
library(WGCNA)
options(stringsAsFactors = FALSE)

注意，如果你想打开多线程分析，可以使用一下代码

enableWGCNAThreads()

取决于你的电脑线程数量

一、导入基因表达量数据

## 读取txt文件格式数据
WGCNA.fpkm = read.table("ExpData_WGCNA.txt",header=T,
                        comment.char = "",
                        check.names=F)
###############
# 读取csv文件格式
WGCNA.fpkm = read.csv("ExpData_WGCNA.csv", header = T, check.names = F)

输入数据格式

数据处理

dim(WGCNA.fpkm)
names(WGCNA.fpkm)
datExpr0 = as.data.frame(t(WGCNA.fpkm[,-1]))
names(datExpr0) = WGCNA.fpkm$sample;##########如果第一行不是ID命名，就写成fpkm[,1]
rownames(datExpr0) = names(WGCNA.fpkm[,-1])

过滤数据

gsg = goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK
if (!gsg$allOK)
{
  if (sum(!gsg$goodGenes)>0)
    printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")))
  if (sum(!gsg$goodSamples)>0)
    printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")))
  # Remove the offending genes and samples from the data:
  datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}

过滤低于设定的值的基因

##filter
meanFPKM=0.5  ###--过滤标准，可以修改
n=nrow(datExpr0)
datExpr0[n+1,]=apply(datExpr0[c(1:nrow(datExpr0)),],2,mean)
datExpr0=datExpr0[1:n,datExpr0[n+1,] > meanFPKM]
# for meanFpkm in row n+1 and it must be above what you set--select meanFpkm>opt$meanFpkm(by rp)
filtered_fpkm=t(datExpr0)
filtered_fpkm=data.frame(rownames(filtered_fpkm),filtered_fpkm)
names(filtered_fpkm)[1]="sample"
head(filtered_fpkm)
write.table(filtered_fpkm, file="mRNA.filter.txt",
            row.names=F, col.names=T,quote=FALSE,sep="\t")

Sample cluster

sampleTree = hclust(dist(datExpr0), method = "average")
pdf(file = "1.sampleClustering.pdf", width = 15, height = 8)
par(cex = 0.6)
par(mar = c(0,6,6,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 2,
     cex.axis = 1.5, cex.main = 2)
### Plot a line to show the cut
#abline(h = 180, col = "red")##剪切高度不确定，故无红线
dev.off()

不过滤数据

如果你的数据不进行过滤直接进行一下操作,此步与前面的操作相同，任选异种即可。

## 不过滤
## Determine cluster under the line
clust = cutreeStatic(sampleTree, cutHeight = 50000, minSize = 10)
table(clust)
# clust 1 contains the samples we want to keep.
keepSamples = (clust!=0)
datExpr0 = datExpr0[keepSamples, ]
write.table(datExpr0, file="mRNA.symbol.uniq.filter.sample.txt",
            row.names=T, col.names=T,quote=FALSE,sep="\t")
###
#############Sample cluster###########
sampleTree = hclust(dist(datExpr0), method = "average")
pdf(file = "1.sampleClustering.filter.pdf", width = 12, height = 9)
par(cex = 0.6)
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
     cex.axis = 1.5, cex.main = 2)
### Plot a line to show the cut
#abline(h = 50000, col = "red")##剪切高度不确定，故无红线
dev.off()

二、导入性状数据

traitData = read.table("TraitData.txt",row.names=1,header=T,comment.char = "",check.names=F)
allTraits = traitData
dim(allTraits)
names(allTraits)

## 形成一个类似于表达数据的数据框架
fpkmSamples = rownames(datExpr0)
traitSamples =rownames(allTraits)
traitRows = match(fpkmSamples, traitSamples)
datTraits = allTraits[traitRows,]
rownames(datTraits)
collectGarbage()

再次样本聚类

sampleTree2 = hclust(dist(datExpr0), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(datTraits, signed = FALSE)

输出样本聚类图

pdf(file="2.Sample_dendrogram_and_trait_heatmap.pdf",width=20,height=12)
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datTraits),
                    main = "Sample dendrogram and trait heatmap",cex.colorLabels = 1.5, cex.dendroLabels = 1, cex.rowText = 2)
dev.off()

三、WGCNA分析（后面都是重点）

筛选软阈值

enableWGCNAThreads()
# 设置soft-thresholding powers的数量
powers = c(1:30)
sft = pickSoftThreshold(datExpr0, powerVector = powers, verbose = 5)

此步骤是比较耗费时间的，静静等待即可。

绘制soft Threshold plot

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.8,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()

选择softpower

选择softpower是一个玄学的过程，可以直接使用软件自己认为是最好的softpower值，但是不一定你要获得最好结果；其次，我们自己选择自己认为比较好的softpower值，但是，需要自己不断的筛选。因此，从这里开始WGCNA的分析结果就开始受到不同的影响。

## 选择软件认为是最好的softpower值
#softPower =sft$powerEstimate
---
# 自己设定softpower值
softPower = 9

继续分析

adjacency = adjacency(datExpr0, power = softPower)

将邻接转化为拓扑重叠

这一步建议去服务器上跑，后面的步骤就在服务器上跑吧，数据量太大；如果你的数据量较小，本地也就可以

TOM = TOMsimilarity(adjacency);
dissTOM = 1-TOM

geneTree = hclust(as.dist(dissTOM), method = "average");

绘制聚类树(树状图)

pdf(file="4_Gene clustering on TOM-based dissimilarity.pdf",width=24,height=18)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)
dev.off()

加入模块

minModuleSize = 30
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
                            deepSplit = 2, pamRespectsDendro = FALSE,
                            minClusterSize = minModuleSize);
table(dynamicMods)
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# Plot the dendrogram and colors underneath
#sizeGrWindow(8,6)
pdf(file="5_Dynamic Tree Cut.pdf",width=8,height=6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")
dev.off()

合并模块

做出的WGCNA分析中，具有较多的模块，但是在我们后续的分析中，是使用不到这么多的模块，以及模块越多对我们的分析越困难，那么就必须合并模块信息。具体操作如下。

MEList = moduleEigengenes(datExpr0, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average")
# Plot the result
#sizeGrWindow(7, 6)
pdf(file="6_Clustering of module eigengenes.pdf",width=7,height=6)
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")
######剪切高度可修改
MEDissThres = 0.4  
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
dev.off()

合并及绘图

 = mergeCloseModules(datExpr0, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
table(mergedColors)
#sizeGrWindow(12, 9)
pdf(file="7_merged dynamic.pdf", width = 9, height = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
dev.off()

Rename to moduleColors

moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50))
moduleLabels = match(moduleColors, colorOrder)-1
MEs = mergedMEs

性状数据与基因模块进行分析

nGenes = ncol(datExpr0)
nSamples = nrow(datExpr0)
moduleTraitCor = cor(MEs, datTraits, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

绘制模块性状相关性图

pdf(file="8_Module-trait relationships.pdf",width=10,height=10)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                   signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(datTraits),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = greenWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))
dev.off()

计算MM和GS

modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr0, MEs, use = "p"))
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")
#names of those trait
traitNames=names(datTraits)
geneTraitSignificance = as.data.frame(cor(datExpr0, datTraits, use = "p"))
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) = paste("GS.", traitNames, sep="")
names(GSPvalue) = paste("p.GS.", traitNames, sep="")

批量绘制性状与各个模块基因的相关性图

for (trait in traitNames){
  traitColumn=match(trait,traitNames)
  for (module in modNames){
    column = match(module, modNames)
    moduleGenes = moduleColors==module
    if (nrow(geneModuleMembership[moduleGenes,]) > 1){####进行这部分计算必须每个模块内基因数量大于2，由于前面设置了最小数量是30，这里可以不做这个判断，但是grey有可能会出现1个gene,它会导致代码运行的时候中断，故设置这一步
      #sizeGrWindow(7, 7)
      pdf(file=paste("9_", trait, "_", module,"_Module membership vs gene significance.pdf",sep=""),width=7,height=7)
      par(mfrow = c(1,1))
      verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                         abs(geneTraitSignificance[moduleGenes, traitColumn]),
                         xlab = paste("Module Membership in", module, "module"),
                         ylab = paste("Gene significance for ",trait),
                         main = paste("Module membership vs. gene significance\n"),
                         cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
      dev.off()
    }
  }
}
names(datExpr0)
probes = names(datExpr0)

输出GS和MM数据

geneInfo0 = data.frame(probes= probes,
                       moduleColor = moduleColors)
for (Tra in 1:ncol(geneTraitSignificance))
{
  oldNames = names(geneInfo0)
  geneInfo0 = data.frame(geneInfo0, geneTraitSignificance[,Tra],
                         GSPvalue[, Tra])
  names(geneInfo0) = c(oldNames,names(geneTraitSignificance)[Tra],
                       names(GSPvalue)[Tra])
}
for (mod in 1:ncol(geneModuleMembership))
{
  oldNames = names(geneInfo0)
  geneInfo0 = data.frame(geneInfo0, geneModuleMembership[,mod],
                         MMPvalue[, mod])
  names(geneInfo0) = c(oldNames,names(geneModuleMembership)[mod],
                       names(MMPvalue)[mod])
}
geneOrder =order(geneInfo0$moduleColor)
geneInfo = geneInfo0[geneOrder, ]
write.table(geneInfo, file = "10_GS_and_MM.xls",sep="\t",row.names=F)

可视化基因网络

nGenes = ncol(datExpr0)
nSamples = nrow(datExpr0)
nSelect = 400
# For reproducibility, we set the random seed 不能用全部的基因，不然会爆炸的
set.seed(10)
select = sample(nGenes, size = nSelect)
selectTOM = dissTOM[select, select]
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select]
#sizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^7
diag(plotDiss) = NA

绘图

library("gplots")
pdf(file="13_Network heatmap plot_selected genes.pdf",width=9, height=9)
mycol = colorpanel(250,'red','orange','lemonchiffon')
TOMplot(plotDiss, selectTree, selectColors, col=mycol ,main = "Network heatmap plot, selected genes")
dev.off()

特征基因的基因网络可视化

pdf(file="14_Eigengene dendrogram and Eigengene adjacency heatmap.pdf", width=5, height=7.5)
par(cex = 0.9)
plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)
dev.off()

获得本教程代码链接：WGCNA分析 | 全流程分析代码 | 代码一

小杜的生信筆記，主要发表或收录生物信息学的教程，以及基于R的分析和可视化（包括数据分析，图形绘制等）；分享感兴趣的文献和学习资料!!

WGCNA分析 | 代码一

分析代码

设置文件位置

一、导入基因表达量数据

数据处理

过滤数据

Sample cluster

不过滤数据

二、导入性状数据

再次样本聚类

三、WGCNA分析（后面都是重点）

筛选软阈值

绘制soft Threshold plot

选择softpower

将邻接转化为拓扑重叠

绘制聚类树(树状图)

加入模块

合并模块

Rename to moduleColors

性状数据与基因模块进行分析

绘制模块性状相关性图

计算MM和GS

批量绘制性状与各个模块基因的相关性图

输出GS和MM数据

可视化基因网络

绘图

特征基因的基因网络可视化

php redis分布式锁

linux内存缓存占用过高分析和优化

stm32编写Modbus步骤

如何保证数据库和缓存的一致性

Mongodb聚合操作中的$unset

私域引流宝PHP源码以及搭建教程

分析代码

设置文件位置

一、导入基因表达量数据

数据处理

过滤数据

Sample cluster

不过滤数据

二、导入性状数据

再次样本聚类

三、WGCNA分析（后面都是重点）

筛选软阈值

绘制soft Threshold plot

选择softpower

将邻接转化为拓扑重叠

绘制聚类树(树状图)

加入模块

合并模块

Rename to moduleColors

性状数据与基因模块进行分析

绘制模块性状相关性图

计算MM和GS

批量绘制性状与各个模块基因的相关性图

输出GS和MM数据

可视化基因网络

绘图

特征基因的基因网络可视化

猜你喜欢

php redis分布式锁

linux内存缓存占用过高分析和优化

stm32编写Modbus步骤

如何保证数据库和缓存的一致性

Mongodb聚合操作中的$unset

私域引流宝PHP源码 以及搭建教程

私域引流宝PHP源码以及搭建教程