Gene Set Enrichment and Network Clustering

The slides introducing the concepts of Gene Set Enrichment and Network Clustering Analysis can be found here.

Preparing the data

We will use the Darrah at al dataset to look for gene sets and gene clusters associated with protection. This experiment comprises 15 rhesus macaques vaccinated with BCG via various strategies and and BALs collected at week 13 and 25 post vaccination, prior to Mtb challenge at week 26.

The Darrah et al dataset is a single-cell RNAseq dataset, and each cell has been labelled with its canonical named cell type.

To simplify the analysis, cells representing the same cell type have been summed into a single ‘pseudobulk’ count per-gene per-sample. Thus, single-cell RNAseq was used to identify and define cell types, and we can apply standard bulk RNAseq tools to interpret gene expression within each cell type.

First, load in the raw count data and associated experiment design metadata, normalize the counts, tidy up the metadata, and create a long annotated raw count dataframe.

Key metadata columns

VaccineRoute
- AE (aerosol BCG)
- IDHigh (Intradermal BCG high dose)
- IDLow (Intradermal BCG low dose)
- IV (intravenous BCG) *
- Naive (unvaccinated control)
vaccination__time_since:
- N. weeks post vaccine
donor_id:
- Unique ID per NHP
Stimulated
- Samples stimulated with PPD: Yes or No

The DESeq2 vst() function will normalize the count data to account for sequencing depth, and the mean-variance relationship, and returns a log-scaled matrix of normalized counts

dataFilename =  '../darrah_etal/pseudo_bulk_wk13_wk25.csv'
metaFilename =  '../darrah_etal/updated_alexandria_metadata.txt'

#Skip the second row, but take column names from the first row
metaCols <- c('organ__ontology_label', 'vaccination', 'vaccination__ontology_label',
              'sex', 'biosample_id', 'cell_type__ontology_label', 'disease__ontology_label', 
              'donor_id', 'Stimulated', 'VaccineRoute',  'VaccineRouteUnique', 
              'VaccineRouteUniqueGroup', 'vaccination_route', 'vaccination__time_since',
              'vaccination__time_since__unit','vaccination__dosage')

metaData <- set_names(read.table(metaFilename, sep="\t", header=F, skip=2), 
                 as.character(read.table(metaFilename,sep="\t",  nrows=1, header=F))) %>%
  select(one_of(metaCols)) %>%
  mutate(countColID=sprintf("D%s_WK%d_STIM%s_%s", donor_id, 
                            vaccination__time_since, Stimulated, make.names(cell_type__ontology_label))) %>%
  distinct()



rawCounts <-  read.csv(dataFilename) %>% 
  rename_with(.fn=\(.)"gene", .cols=1) %>%
  column_to_rownames("gene") %>%
  as.matrix()

longCounts <- rawCounts %>%
  vst() %>%
  data.frame(check.names = F) %>%
  rownames_to_column("gene") %>%
  pivot_longer(-gene, names_to="countColID", values_to="normExpr") %>%
  left_join(metaData, by="countColID") %>%
  mutate(status=sprintf("%s_%dwks_Stim%s", VaccineRoute, vaccination__time_since, Stimulated))

Differential expression

We can use the standard DESeq2 pipeline to identify gene that are highly expressed in specific cell types and differ between conditions

#Exclude unnamed genes, e.g. LOCxxxx, genexxxx or CXHXorfXX for interpretability
annotatedGenes <- grep("^(gene|LOC|C\\d+H\\d+orf\\d+)", rownames(rawCounts), value=T, invert=T)

#We can fit models incorporating all the study design terms, and extract comparisons from it
#A factor we don't account for: repeated measurements from the same donor_id
studyDesignFormula <- ~0+VaccineRoute+Stimulated+vaccination__time_since


darrahCellTypeDatasets <- unique(metaData$cell_type__ontology_label) %>%
  set_names() %>%
  map(\(cellType){
    message(cellType)
    cellTypeMetadata <- filter(metaData, cell_type__ontology_label==cellType) %>%
      mutate(status=sprintf("%s_%dwks_Stim%s", VaccineRoute, vaccination__time_since, Stimulated),
             countColID=make.names(countColID)) 
    message(cellTypeMetadata$countColID)
    DESeqDataSetFromMatrix(countData=rawCounts[annotatedGenes,cellTypeMetadata$countColID],
                           colData=cellTypeMetadata,
                           design=~0+status)
  })


darrahCelltypeModels <- map(darrahCellTypeDatasets, DESeq)

tCellResponseGenes <-  results(darrahCelltypeModels$`T cell`, 
                               c("status", "IV_13wks_StimYES", "Naive_13wks_StimYES")) %>% 
  data.frame() %>% 
  rownames_to_column("gene") %>% 
  filter(!is.na(padj)) %>% 
  arrange(pvalue)

Exploring Heatmaps and hierarchical clustering

Lets visualize T cell response genes identified above, and how they respond in other cell types at other timepoints.

We use pheatmap to plot average gene expression per group, and hierarchically cluster the genes and conditions, indicated with the dendrogram.

We can also generate clusters manually using the hclust function

For more information about pheatmap, go to Dave Tang tutorial, Kamil Slowikowski tutorial or Renesh Bedre tutorial.

#Differentially expressed genes between IV and naive in stimulated T cells at 13 weeks
sigTcellResponseGenes <- tCellResponseGenes %>%
  filter(abs(log2FoldChange) > 1 & padj < 0.05) %>%
  slice_max(order_by=abs(log2FoldChange), n=50)



#Plot the average expression of response genes per-group in T cells
plotData <- longCounts %>%
  filter(gene %in% sigTcellResponseGenes$gene & cell_type__ontology_label %in% "T cell") %>%
  group_by(gene, status) %>%
  summarize(aveExpr=mean(normExpr, trim=0.25)) %>%
  pivot_wider(names_from=status, values_from=aveExpr) %>%
  column_to_rownames("gene")

columnAnnots <- longCounts %>% 
  select(status, VaccineRoute, vaccination__time_since, Stimulated) %>%
  distinct() %>%
  column_to_rownames("status")

pheatmap(plotData, 
         scale="row",
         annotation_col = columnAnnots)

#Hierarchical clustering done manually
geneClusters <- hclust(dist(plotData), method="ward.D2")
statusClusters <- hclust(dist(t(plotData)), method="ward.D2")

#Pass the clustering to pheatmap
pheatmap(plotData, 
         scale="row",
         annotation_col = columnAnnots,
         cluster_rows = geneClusters,
         cluster_cols = statusClusters)

Gene set enrichment analysis

We can interpret our list of differentially expressed genes by comparing to annotated empirically derived sets of coherently expressed genes. Caveat: these gene sets were derived from human whole-blood gene expression studies, and we are looking at NHP BAL pseudobulk data. However, human and NHP gene responses are similar, and whole blood primarily reflects immune cell responses, similar to BAL, so they should provide useful information.

The fgsea packages assesses gene set enrichment using ranked gene statistics (from our DESeq2 results) and lists of pre-defined gene sets. We will use MSigDB Hallmark modules, along with published gene sets from Li et al (LI), and Chaussabel et al (DC).

For more information about fgsea, go to Biostatsquid Tutorial

#Format the gene sets
data("tmod") 

liAndDcGsets <- map(tmod$gs2gv, \(gIdx)tmod$gv[gIdx]) %>%
  set_names(paste(tmod$gs$ID, tmod$gs$Title))

hallmarkGsets <- msigdbr(species = "Homo sapiens", category = "H") %>%
  select(gs_name, gene_symbol) %>%
  split(.$gs_name) %>%
  map(\(gsDF)gsDF$gene_symbol)

gsets <- c(liAndDcGsets, hallmarkGsets)

#Filter to only contain genes in our dataset, and drop genesets with < 5 genes
gsets <- gsets %>%
  map(\(gset){
    genes <- intersect(annotatedGenes, gset)
    if (length(genes)>=5) genes else c()
}) %>% compact()

tCellResponseEnrichment <- fgsea(pathways=gsets,
                                 stats=set_names(tCellResponseGenes$log2FoldChange,
                                                 tCellResponseGenes$gene))

sigGeneSets <- tCellResponseEnrichment %>%
  data.frame() %>%
  filter(padj < 0.01) %>%
  arrange(NES) %>%
  mutate(pathway=factor(pathway, levels=pathway))

#Plot the most strongly increased pathway
topPathway <- as.character(slice_max(sigGeneSets, n=1, order_by=NES)$pathway)

plotEnrichment(gsets[[topPathway]],
               set_names(tCellResponseGenes$log2FoldChange,
                                                 tCellResponseGenes$gene)) +
  labs(title=topPathway)

ggplot(sigGeneSets, aes(y=pathway, x=NES, fill=NES)) +
  geom_col() +
  scale_fill_distiller(palette="RdBu") +
  theme_minimal() +
  labs(title="GSEA: BCG-IV vs Naive T cells",
       subtitle="13 weeks, PPD stim", y=NULL)

Unsupervised clustering using WGCNA

WGCNA refers to Weighted Gene Co-expression Network Analysis, created by Langfelder and Horvath (2018). WGNCA is an unsupervised clustering approach to find data-driven correlated gene sets in expression data.

The first step in WGCNA is to create an [n_gene x n_gene] correlation matrix, which is transformed into a adjacency matrix using a thresholding procedure (hard or soft threshold) applied to the correlation matrix.

This is followed by an unsupervised gene module detection step. This is based on topological overlap, e.g. 2 gene have high topological overlap if they are ‘adjacent’ to similar sets of genes, based on the adjacency matrix. This topological overlap measure is used to crease a hierarchical clustering tree. The clustering tree is then ‘cut’ into individual gene modules that are highly topologically similar.

Below, we apply the WGCNA algorithm to the Darrah et al macrophage expression data to discover and test macrophage-specific gene modules.

For more information about WGCNA, check out the following tutorials: UT Austin Wiki bioinformatics workbook Natalia Faraj Murad tutorial

Identify WGCNA clusters

macNormCountMat <- longCounts %>%
  filter(gene %in% annotatedGenes & cell_type__ontology_label=="macrophage") %>%
  select(gene, countColID, normExpr) %>%
  pivot_wider(names_from="gene", values_from="normExpr") %>%
  column_to_rownames("countColID") %>%
  as.matrix()

powers <-  1:20

softThreshPower <- pickSoftThreshold(
  macNormCountMat,   
  powerVector = powers,
  verbose = 0)

##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1    0.944  0.484          0.930  5860.0   5900.00   9570
## 2      2    0.358 -0.213          0.219  3020.0   2700.00   6660
## 3      3    0.719 -0.492          0.756  1890.0   1400.00   5140
## 4      4    0.774 -0.592          0.760  1310.0    780.00   4180
## 5      5    0.809 -0.651          0.767   966.0    464.00   3510
## 6      6    0.836 -0.692          0.790   743.0    286.00   3010
## 7      7    0.852 -0.728          0.812   588.0    182.00   2620
## 8      8    0.862 -0.763          0.832   475.0    119.00   2310
## 9      9    0.869 -0.795          0.851   391.0     80.60   2050
## 10    10    0.863 -0.831          0.858   325.0     55.70   1830
## 11    11    0.867 -0.860          0.873   274.0     39.20   1650
## 12    12    0.865 -0.892          0.883   233.0     28.00   1490
## 13    13    0.862 -0.918          0.890   199.0     20.40   1360
## 14    14    0.859 -0.945          0.899   172.0     15.10   1240
## 15    15    0.861 -0.968          0.907   149.0     11.20   1130
## 16    16    0.859 -0.993          0.911   130.0      8.53   1040
## 17    17    0.857 -1.020          0.915   114.0      6.45    959
## 18    18    0.861 -1.030          0.925   100.0      5.08    885
## 19    19    0.849 -1.060          0.922    88.8      4.00    818
## 20    20    0.846 -1.080          0.923    78.8      3.14    758

conflicts_prefer(WGCNA::cor)
macWgncaNetwork <- blockwiseModules(datExpr=macNormCountMat, 
                                 power=softThreshPower$powerEstimate,
                                 saveTOMs=T)

macWcgnaLabels <- data.frame(colorLabel=macWgncaNetwork$colors,
                          gene=names(macWgncaNetwork$colors)) %>%
  split(.$colorLabel) %>%
  map(\(df)df$gene)

macWgcnaModuleCounts <- table(macWgncaNetwork$colors)
#One module has the  majority of genes. Since most genes don't respond
#presumably it's the module of non-reponsive genes

#How do the wgcna modules compare with coherently expressed modules
#Lets look at gene level overlaps and test with hypergeometric test

WGCNA clusters vs published gene sets

We can see how our WGCNA macrophage clusters map to published sets of coherently expressed genes by comparing the WGCNA clusters to published BTM sets.

The heatmap shows signficant overlaps.

modulePairs <- expand.grid(WGCNA=names(macWgcnaModuleCounts),
                           BTMs=names(gsets))


macwWcgnaBtmStats <- map2_dfr(modulePairs$WGCNA, modulePairs$BTMs, \(wgcnaName, btmName){
  wgcnaGenes <- macWcgnaLabels[[wgcnaName]]
  btmGenes <- gsets[[btmName]]
  sharedGenes <- intersect(wgcnaGenes, btmGenes)
  pOverlap <- if(length(sharedGenes)==0) {
    1
  } else{
    1 - phyper(length(sharedGenes),
               length(btmGenes),
               length(unique(unlist(gsets))),
               length(wgcnaGenes),
               lower.tail=T)
  }
  data.frame(wgcnaName=wgcnaName, btmName=btmName,
             nWgcnaGenes=length(wgcnaGenes), nBtmGenes=length(btmGenes), nSharedGenes=length(sharedGenes),
             pOverlap=pOverlap)
}) %>% mutate(FDR=p.adjust(pOverlap, method="fdr"))


sigwWcgnaBtmStatsWide <- macwWcgnaBtmStats %>%
  filter(FDR<0.05 & nSharedGenes>=10) %>%
  select(wgcnaName, btmName, nSharedGenes) %>%
  pivot_wider(names_from="btmName", values_from=nSharedGenes, values_fill = 0) %>%
  column_to_rownames("wgcnaName")

pheatmap(sigwWcgnaBtmStatsWide, 
         color = RColorBrewer::brewer.pal(n=7, name="YlOrRd"),
         clustering_method="ward.D2")

How do WGCNA clusters relate to experimental status?

Averaging WGCNA module expression by status lets us visualize relationships between WGNCA macrophage clusters and mutliple aspects of the experiments immune responses

#Now, average WGCNA module by cell type and vaccination status

WGNAmodStats <- longCounts %>%
  filter(cell_type__ontology_label %in% "macrophage") %>%
  left_join(data.frame(gene=names(macWgncaNetwork$colors),
                       WGCNAmod=macWgncaNetwork$colors)) %>%
    filter(!is.na(WGCNAmod)) %>%
  group_by(WGCNAmod) %>%
  summarize(aveModExpr=mean(normExpr, trim=0.25), sdModExpr=sd(normExpr))
  

aveWGNAexprByStatusAndCellType <- longCounts %>%
    filter(cell_type__ontology_label %in% "macrophage")%>%
  left_join(data.frame(gene=names(macWgncaNetwork$colors),
                       WGCNAmod=macWgncaNetwork$colors)) %>%
  filter(!is.na(WGCNAmod)) %>%
  group_by(WGCNAmod, status, cell_type__ontology_label) %>%
  summarize(modExprPerStatusPerCellType=mean(normExpr, trim=0.25)) %>%
  ungroup() %>%
  left_join(WGNAmodStats, by="WGCNAmod") %>%
  mutate(scaledModExprPerStatusPerCellType = (modExprPerStatusPerCellType-aveModExpr)/sdModExpr)

statusAnnots <- longCounts %>%
  select(celltype=cell_type__ontology_label, status, VaccineRoute,
         timepoint=vaccination__time_since, stim=Stimulated) %>%
  distinct() %>%
  mutate(colName=paste(celltype, status, sep="|"), timepoint=factor(timepoint)) %>%
  select(-status, -celltype) %>%
  column_to_rownames("colName")

aveWGNAexprByStatusAndCellTypeWide <- aveWGNAexprByStatusAndCellType %>%
  select(WGCNAmod, status, cell_type__ontology_label, scaledModExprPerStatusPerCellType) %>%
  mutate(label=paste(cell_type__ontology_label, status, sep="|")) %>%
  select(label,WGCNAmod, scaledModExprPerStatusPerCellType) %>%
  pivot_wider(names_from=label, values_from=scaledModExprPerStatusPerCellType) %>%
  column_to_rownames("WGCNAmod")

annotColors <- list(celltype=c("B cell"="purple",
                              "eosinophil"="yellow",
                              "epithelial cell"="grey",
                              "macrophage"="green",
                              "mast cell"="orange",
                              "neutrophil"="red",
                              "T cell"="black"),
                    stim=c(NO="white", YES="black"),
                    timepoint=c(`13`="white", `25`="grey"),
                    VaccineRoute=c(Naive="white", AE="purple", 
                                   IDHigh="darkgreen", IDLow="green", 
                                   IV="firebrick"))

pheatmap(aveWGNAexprByStatusAndCellTypeWide,
         annotation_col = statusAnnots,
         show_colnames = F, annotation_colors = annotColors,
         clustering_method = "ward.D2")

GWENA: a WGCNA alternative

GWENA is an alternative implementation of WGCNA (Lemoine et al., BMC Bioinformatics, PMID:[34034647](https://pubmed.ncbi.nlm.nih.gov/34034647/)), which includes some nice visualization features.

GWENA tutorial

BiocManager::install("GWENA")
library(GWENA)

# Prepare count data
VSTcounts = rawCounts %>%
      vst() %>%
      data.frame(check.names = F) %>%
      t()

VSTdataFiltered = filter_low_var(VSTcounts,pct=0.1,type = "median")


# Building network
net <- build_net(VSTdataFiltered, cor_func = "spearman",n_threads=1)

net$metadata$power

## [1] 12

fit_power_table <- net$metadata$fit_power_table
fit_power_table[fit_power_table$Power == net$metadata$power, "SFT.R.sq"]

## [1] 0.849985

# Identifying modules
modules <- detect_modules(VSTdataFiltered,
net$network,
detailled_result = TRUE,
merge_threshold = 0.95)

length(unique(modules$modules_premerge))

## [1] 12

length(unique(modules$modules))

## [1] 11

# Plotting modules
layout_mod_merge <- plot_modules_merge(
modules_premerge = modules$modules_premerge,
modules_merged = modules$modules)

ggplot2::ggplot(data.frame(modules$modules %>% stack),
ggplot2::aes(x = ind)) + ggplot2::stat_count() +
ggplot2::ylab("Number of genes") +
ggplot2::xlab("Module")

# Visualizing functional enrichment in modules
enrichment <- bio_enrich(modules$modules)
plot_enrichment(enrichment)

# Visualizing phenotypic association in modules
phenotype_association <- associate_phenotype(
modules$modules_eigengenes,
metaData %>% dplyr::select(cell_type__ontology_label, VaccineRoute))
plot_modules_phenotype(phenotype_association)

# Visualizing individual modules
module_example <- modules$modules$`10`
graph <- build_graph_from_sq_mat(net$network[module_example, module_example])
net_mod_2 <- net$network[modules$modules$`10`, modules$modules$`10`] 
sub_clusters <- get_sub_clusters(net_mod_2)

layout_mod_2 <- plot_module(graph, upper_weight_th = 0.999, 
                            groups = sub_clusters,
                             vertex.label.cex = 0, 
                             node_scaling_max = 7, 
                             legend_cex = 1)