The slides introducing the concepts of Gene Set Enrichment and Network Clustering Analysis can be found here.
We will use the Darrah at al dataset to look for gene sets and gene clusters associated with protection. This experiment comprises 15 rhesus macaques vaccinated with BCG via various strategies and and BALs collected at week 13 and 25 post vaccination, prior to Mtb challenge at week 26.
The Darrah et al dataset is a single-cell RNAseq dataset, and each cell has been labelled with its canonical named cell type.
To simplify the analysis, cells representing the same cell type have been summed into a single ‘pseudobulk’ count per-gene per-sample. Thus, single-cell RNAseq was used to identify and define cell types, and we can apply standard bulk RNAseq tools to interpret gene expression within each cell type.
First, load in the raw count data and associated experiment design metadata, normalize the counts, tidy up the metadata, and create a long annotated raw count dataframe.
Key metadata columns
The DESeq2 vst()
function will normalize the count data
to account for sequencing depth, and the mean-variance relationship, and
returns a log-scaled matrix of normalized counts
dataFilename = '../darrah_etal/pseudo_bulk_wk13_wk25.csv'
metaFilename = '../darrah_etal/updated_alexandria_metadata.txt'
#Skip the second row, but take column names from the first row
metaCols <- c('organ__ontology_label', 'vaccination', 'vaccination__ontology_label',
'sex', 'biosample_id', 'cell_type__ontology_label', 'disease__ontology_label',
'donor_id', 'Stimulated', 'VaccineRoute', 'VaccineRouteUnique',
'VaccineRouteUniqueGroup', 'vaccination_route', 'vaccination__time_since',
'vaccination__time_since__unit','vaccination__dosage')
metaData <- set_names(read.table(metaFilename, sep="\t", header=F, skip=2),
as.character(read.table(metaFilename,sep="\t", nrows=1, header=F))) %>%
select(one_of(metaCols)) %>%
mutate(countColID=sprintf("D%s_WK%d_STIM%s_%s", donor_id,
vaccination__time_since, Stimulated, make.names(cell_type__ontology_label))) %>%
distinct()
rawCounts <- read.csv(dataFilename) %>%
rename_with(.fn=\(.)"gene", .cols=1) %>%
column_to_rownames("gene") %>%
as.matrix()
longCounts <- rawCounts %>%
vst() %>%
data.frame(check.names = F) %>%
rownames_to_column("gene") %>%
pivot_longer(-gene, names_to="countColID", values_to="normExpr") %>%
left_join(metaData, by="countColID") %>%
mutate(status=sprintf("%s_%dwks_Stim%s", VaccineRoute, vaccination__time_since, Stimulated))
We can use the standard DESeq2 pipeline to identify gene that are highly expressed in specific cell types and differ between conditions
#Exclude unnamed genes, e.g. LOCxxxx, genexxxx or CXHXorfXX for interpretability
annotatedGenes <- grep("^(gene|LOC|C\\d+H\\d+orf\\d+)", rownames(rawCounts), value=T, invert=T)
#We can fit models incorporating all the study design terms, and extract comparisons from it
#A factor we don't account for: repeated measurements from the same donor_id
studyDesignFormula <- ~0+VaccineRoute+Stimulated+vaccination__time_since
darrahCellTypeDatasets <- unique(metaData$cell_type__ontology_label) %>%
set_names() %>%
map(\(cellType){
message(cellType)
cellTypeMetadata <- filter(metaData, cell_type__ontology_label==cellType) %>%
mutate(status=sprintf("%s_%dwks_Stim%s", VaccineRoute, vaccination__time_since, Stimulated),
countColID=make.names(countColID))
message(cellTypeMetadata$countColID)
DESeqDataSetFromMatrix(countData=rawCounts[annotatedGenes,cellTypeMetadata$countColID],
colData=cellTypeMetadata,
design=~0+status)
})
darrahCelltypeModels <- map(darrahCellTypeDatasets, DESeq)
tCellResponseGenes <- results(darrahCelltypeModels$`T cell`,
c("status", "IV_13wks_StimYES", "Naive_13wks_StimYES")) %>%
data.frame() %>%
rownames_to_column("gene") %>%
filter(!is.na(padj)) %>%
arrange(pvalue)
Lets visualize T cell response genes identified above, and how they respond in other cell types at other timepoints.
We use pheatmap to plot average gene expression per group, and hierarchically cluster the genes and conditions, indicated with the dendrogram.
We can also generate clusters manually using the hclust
function
For more information about pheatmap
, go to Dave
Tang tutorial, Kamil Slowikowski
tutorial or Renesh
Bedre tutorial.
#Differentially expressed genes between IV and naive in stimulated T cells at 13 weeks
sigTcellResponseGenes <- tCellResponseGenes %>%
filter(abs(log2FoldChange) > 1 & padj < 0.05) %>%
slice_max(order_by=abs(log2FoldChange), n=50)
#Plot the average expression of response genes per-group in T cells
plotData <- longCounts %>%
filter(gene %in% sigTcellResponseGenes$gene & cell_type__ontology_label %in% "T cell") %>%
group_by(gene, status) %>%
summarize(aveExpr=mean(normExpr, trim=0.25)) %>%
pivot_wider(names_from=status, values_from=aveExpr) %>%
column_to_rownames("gene")
columnAnnots <- longCounts %>%
select(status, VaccineRoute, vaccination__time_since, Stimulated) %>%
distinct() %>%
column_to_rownames("status")
pheatmap(plotData,
scale="row",
annotation_col = columnAnnots)
#Hierarchical clustering done manually
geneClusters <- hclust(dist(plotData), method="ward.D2")
statusClusters <- hclust(dist(t(plotData)), method="ward.D2")
#Pass the clustering to pheatmap
pheatmap(plotData,
scale="row",
annotation_col = columnAnnots,
cluster_rows = geneClusters,
cluster_cols = statusClusters)
We can interpret our list of differentially expressed genes by comparing to annotated empirically derived sets of coherently expressed genes. Caveat: these gene sets were derived from human whole-blood gene expression studies, and we are looking at NHP BAL pseudobulk data. However, human and NHP gene responses are similar, and whole blood primarily reflects immune cell responses, similar to BAL, so they should provide useful information.
The fgsea
packages assesses gene set enrichment using
ranked gene statistics (from our DESeq2 results) and lists of
pre-defined gene sets. We will use MSigDB Hallmark modules, along with
published gene sets from Li et al (LI), and Chaussabel et al (DC).
For more information about fgsea
, go to Biostatsquid
Tutorial
#Format the gene sets
data("tmod")
liAndDcGsets <- map(tmod$gs2gv, \(gIdx)tmod$gv[gIdx]) %>%
set_names(paste(tmod$gs$ID, tmod$gs$Title))
hallmarkGsets <- msigdbr(species = "Homo sapiens", category = "H") %>%
select(gs_name, gene_symbol) %>%
split(.$gs_name) %>%
map(\(gsDF)gsDF$gene_symbol)
gsets <- c(liAndDcGsets, hallmarkGsets)
#Filter to only contain genes in our dataset, and drop genesets with < 5 genes
gsets <- gsets %>%
map(\(gset){
genes <- intersect(annotatedGenes, gset)
if (length(genes)>=5) genes else c()
}) %>% compact()
tCellResponseEnrichment <- fgsea(pathways=gsets,
stats=set_names(tCellResponseGenes$log2FoldChange,
tCellResponseGenes$gene))
sigGeneSets <- tCellResponseEnrichment %>%
data.frame() %>%
filter(padj < 0.01) %>%
arrange(NES) %>%
mutate(pathway=factor(pathway, levels=pathway))
#Plot the most strongly increased pathway
topPathway <- as.character(slice_max(sigGeneSets, n=1, order_by=NES)$pathway)
plotEnrichment(gsets[[topPathway]],
set_names(tCellResponseGenes$log2FoldChange,
tCellResponseGenes$gene)) +
labs(title=topPathway)
ggplot(sigGeneSets, aes(y=pathway, x=NES, fill=NES)) +
geom_col() +
scale_fill_distiller(palette="RdBu") +
theme_minimal() +
labs(title="GSEA: BCG-IV vs Naive T cells",
subtitle="13 weeks, PPD stim", y=NULL)
WGCNA refers to Weighted Gene Co-expression Network Analysis, created by Langfelder and Horvath (2018). WGNCA is an unsupervised clustering approach to find data-driven correlated gene sets in expression data.
The first step in WGCNA is to create an [n_gene x n_gene] correlation matrix, which is transformed into a adjacency matrix using a thresholding procedure (hard or soft threshold) applied to the correlation matrix.
This is followed by an unsupervised gene module detection step. This is based on topological overlap, e.g. 2 gene have high topological overlap if they are ‘adjacent’ to similar sets of genes, based on the adjacency matrix. This topological overlap measure is used to crease a hierarchical clustering tree. The clustering tree is then ‘cut’ into individual gene modules that are highly topologically similar.
Below, we apply the WGCNA algorithm to the Darrah et al macrophage expression data to discover and test macrophage-specific gene modules.
For more information about WGCNA, check out the following tutorials: UT Austin Wiki bioinformatics workbook Natalia Faraj Murad tutorial
macNormCountMat <- longCounts %>%
filter(gene %in% annotatedGenes & cell_type__ontology_label=="macrophage") %>%
select(gene, countColID, normExpr) %>%
pivot_wider(names_from="gene", values_from="normExpr") %>%
column_to_rownames("countColID") %>%
as.matrix()
powers <- 1:20
softThreshPower <- pickSoftThreshold(
macNormCountMat,
powerVector = powers,
verbose = 0)
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.944 0.484 0.930 5860.0 5900.00 9570
## 2 2 0.358 -0.213 0.219 3020.0 2700.00 6660
## 3 3 0.719 -0.492 0.756 1890.0 1400.00 5140
## 4 4 0.774 -0.592 0.760 1310.0 780.00 4180
## 5 5 0.809 -0.651 0.767 966.0 464.00 3510
## 6 6 0.836 -0.692 0.790 743.0 286.00 3010
## 7 7 0.852 -0.728 0.812 588.0 182.00 2620
## 8 8 0.862 -0.763 0.832 475.0 119.00 2310
## 9 9 0.869 -0.795 0.851 391.0 80.60 2050
## 10 10 0.863 -0.831 0.858 325.0 55.70 1830
## 11 11 0.867 -0.860 0.873 274.0 39.20 1650
## 12 12 0.865 -0.892 0.883 233.0 28.00 1490
## 13 13 0.862 -0.918 0.890 199.0 20.40 1360
## 14 14 0.859 -0.945 0.899 172.0 15.10 1240
## 15 15 0.861 -0.968 0.907 149.0 11.20 1130
## 16 16 0.859 -0.993 0.911 130.0 8.53 1040
## 17 17 0.857 -1.020 0.915 114.0 6.45 959
## 18 18 0.861 -1.030 0.925 100.0 5.08 885
## 19 19 0.849 -1.060 0.922 88.8 4.00 818
## 20 20 0.846 -1.080 0.923 78.8 3.14 758
conflicts_prefer(WGCNA::cor)
macWgncaNetwork <- blockwiseModules(datExpr=macNormCountMat,
power=softThreshPower$powerEstimate,
saveTOMs=T)
macWcgnaLabels <- data.frame(colorLabel=macWgncaNetwork$colors,
gene=names(macWgncaNetwork$colors)) %>%
split(.$colorLabel) %>%
map(\(df)df$gene)
macWgcnaModuleCounts <- table(macWgncaNetwork$colors)
#One module has the majority of genes. Since most genes don't respond
#presumably it's the module of non-reponsive genes
#How do the wgcna modules compare with coherently expressed modules
#Lets look at gene level overlaps and test with hypergeometric test
We can see how our WGCNA macrophage clusters map to published sets of coherently expressed genes by comparing the WGCNA clusters to published BTM sets.
The heatmap shows signficant overlaps.
modulePairs <- expand.grid(WGCNA=names(macWgcnaModuleCounts),
BTMs=names(gsets))
macwWcgnaBtmStats <- map2_dfr(modulePairs$WGCNA, modulePairs$BTMs, \(wgcnaName, btmName){
wgcnaGenes <- macWcgnaLabels[[wgcnaName]]
btmGenes <- gsets[[btmName]]
sharedGenes <- intersect(wgcnaGenes, btmGenes)
pOverlap <- if(length(sharedGenes)==0) {
1
} else{
1 - phyper(length(sharedGenes),
length(btmGenes),
length(unique(unlist(gsets))),
length(wgcnaGenes),
lower.tail=T)
}
data.frame(wgcnaName=wgcnaName, btmName=btmName,
nWgcnaGenes=length(wgcnaGenes), nBtmGenes=length(btmGenes), nSharedGenes=length(sharedGenes),
pOverlap=pOverlap)
}) %>% mutate(FDR=p.adjust(pOverlap, method="fdr"))
sigwWcgnaBtmStatsWide <- macwWcgnaBtmStats %>%
filter(FDR<0.05 & nSharedGenes>=10) %>%
select(wgcnaName, btmName, nSharedGenes) %>%
pivot_wider(names_from="btmName", values_from=nSharedGenes, values_fill = 0) %>%
column_to_rownames("wgcnaName")
pheatmap(sigwWcgnaBtmStatsWide,
color = RColorBrewer::brewer.pal(n=7, name="YlOrRd"),
clustering_method="ward.D2")
Averaging WGCNA module expression by status lets us visualize relationships between WGNCA macrophage clusters and mutliple aspects of the experiments immune responses
#Now, average WGCNA module by cell type and vaccination status
WGNAmodStats <- longCounts %>%
filter(cell_type__ontology_label %in% "macrophage") %>%
left_join(data.frame(gene=names(macWgncaNetwork$colors),
WGCNAmod=macWgncaNetwork$colors)) %>%
filter(!is.na(WGCNAmod)) %>%
group_by(WGCNAmod) %>%
summarize(aveModExpr=mean(normExpr, trim=0.25), sdModExpr=sd(normExpr))
aveWGNAexprByStatusAndCellType <- longCounts %>%
filter(cell_type__ontology_label %in% "macrophage")%>%
left_join(data.frame(gene=names(macWgncaNetwork$colors),
WGCNAmod=macWgncaNetwork$colors)) %>%
filter(!is.na(WGCNAmod)) %>%
group_by(WGCNAmod, status, cell_type__ontology_label) %>%
summarize(modExprPerStatusPerCellType=mean(normExpr, trim=0.25)) %>%
ungroup() %>%
left_join(WGNAmodStats, by="WGCNAmod") %>%
mutate(scaledModExprPerStatusPerCellType = (modExprPerStatusPerCellType-aveModExpr)/sdModExpr)
statusAnnots <- longCounts %>%
select(celltype=cell_type__ontology_label, status, VaccineRoute,
timepoint=vaccination__time_since, stim=Stimulated) %>%
distinct() %>%
mutate(colName=paste(celltype, status, sep="|"), timepoint=factor(timepoint)) %>%
select(-status, -celltype) %>%
column_to_rownames("colName")
aveWGNAexprByStatusAndCellTypeWide <- aveWGNAexprByStatusAndCellType %>%
select(WGCNAmod, status, cell_type__ontology_label, scaledModExprPerStatusPerCellType) %>%
mutate(label=paste(cell_type__ontology_label, status, sep="|")) %>%
select(label,WGCNAmod, scaledModExprPerStatusPerCellType) %>%
pivot_wider(names_from=label, values_from=scaledModExprPerStatusPerCellType) %>%
column_to_rownames("WGCNAmod")
annotColors <- list(celltype=c("B cell"="purple",
"eosinophil"="yellow",
"epithelial cell"="grey",
"macrophage"="green",
"mast cell"="orange",
"neutrophil"="red",
"T cell"="black"),
stim=c(NO="white", YES="black"),
timepoint=c(`13`="white", `25`="grey"),
VaccineRoute=c(Naive="white", AE="purple",
IDHigh="darkgreen", IDLow="green",
IV="firebrick"))
pheatmap(aveWGNAexprByStatusAndCellTypeWide,
annotation_col = statusAnnots,
show_colnames = F, annotation_colors = annotColors,
clustering_method = "ward.D2")
GWENA is an alternative implementation of WGCNA (Lemoine et al., BMC Bioinformatics, PMID:[34034647](https://pubmed.ncbi.nlm.nih.gov/34034647/)), which includes some nice visualization features.
BiocManager::install("GWENA")
library(GWENA)
# Prepare count data
VSTcounts = rawCounts %>%
vst() %>%
data.frame(check.names = F) %>%
t()
VSTdataFiltered = filter_low_var(VSTcounts,pct=0.1,type = "median")
# Building network
net <- build_net(VSTdataFiltered, cor_func = "spearman",n_threads=1)
net$metadata$power
## [1] 12
fit_power_table <- net$metadata$fit_power_table
fit_power_table[fit_power_table$Power == net$metadata$power, "SFT.R.sq"]
## [1] 0.849985
# Identifying modules
modules <- detect_modules(VSTdataFiltered,
net$network,
detailled_result = TRUE,
merge_threshold = 0.95)
length(unique(modules$modules_premerge))
## [1] 12
length(unique(modules$modules))
## [1] 11
# Plotting modules
layout_mod_merge <- plot_modules_merge(
modules_premerge = modules$modules_premerge,
modules_merged = modules$modules)
ggplot2::ggplot(data.frame(modules$modules %>% stack),
ggplot2::aes(x = ind)) + ggplot2::stat_count() +
ggplot2::ylab("Number of genes") +
ggplot2::xlab("Module")
# Visualizing functional enrichment in modules
enrichment <- bio_enrich(modules$modules)
plot_enrichment(enrichment)
# Visualizing phenotypic association in modules
phenotype_association <- associate_phenotype(
modules$modules_eigengenes,
metaData %>% dplyr::select(cell_type__ontology_label, VaccineRoute))
plot_modules_phenotype(phenotype_association)
# Visualizing individual modules
module_example <- modules$modules$`10`
graph <- build_graph_from_sq_mat(net$network[module_example, module_example])
net_mod_2 <- net$network[modules$modules$`10`, modules$modules$`10`]
sub_clusters <- get_sub_clusters(net_mod_2)
layout_mod_2 <- plot_module(graph, upper_weight_th = 0.999,
groups = sub_clusters,
vertex.label.cex = 0,
node_scaling_max = 7,
legend_cex = 1)