Chapter 5 Classes of RNA-seq biopsy sampels

To better understand the differences between different FSHD samples and identify sub-groups and their characteristics, we combine the RNA-seq gene expression matrics from the first and follow-up visit samples and apply k-means clustering on the sample space build upon selected features. We are able to find five anchor samples and partition all FSHD and controls into five clusters, each of which has unique characteristics.

5.1 FSHD classes

Steps below explained how the samples are paritioned into five clusters:
1. Feature selection: select features that are best characterize the cluster patterns of PCA sample space built on all samples. We end up select the differentially expressed genes in the samples characterized by high expressed DUX4-regulated and inflammatory response genes (cc_sample), as well as muscle development related genes. The feature space is reduced to 705 informative features (Supple. Table S4 and Suppl. Fig. S5B).
2. Sample space construction: use previous selected featrues and their regularized log expression to build sample sapce by PCA. The dimension is then reduced. 3. K-means clusterings: based on the PCA sample space and k-mean cluster is used to partition all samples into five clusters. To enchance the stability of k-means algorithm, we select five anchor samples (01-0041, 01-0022-1, 01-0029b,32-0002b1, 32-0012b) representing the center of each cluster.

5.1.1 Load datasets and libraries

# load datasets and libraries
library(BiocParallel)
multi_param <- MulticoreParam(worker=2)
register(multi_param, default=TRUE)

suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(xlsx))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(gridExtra))

pkg_dir <- "/fh/fast/tapscott_s/CompBio/RNA-Seq/hg38.FSHD.biopsy.all"
source(file.path(pkg_dir, "scripts", "manuscript_tools.R"))
load(file.path(pkg_dir, "public_data", "mri_pathology.rda"))
load(file.path(pkg_dir, "public_data", "sanitized.dds.rda"))
load(file.path(pkg_dir, "public_data", "sanitized.rlg.rda"))
load(file.path(pkg_dir, "public_data", "cc_sample.rda"))
load(file.path(pkg_dir, "public_data", "mri_pathology.rda"))

5.1.2 Feature selection

The FSHD samples that exhibit strong DUX4 and inflammatory signatures are

ccp <- cc_sample %>% filter(DUX4_group == "DUX4+") %>%
  filter(cc_intensity != "cc++")
knitr::kable(ccp, caption="Samples showing strong DUX4 and inflammatory signatures.")

(#tab:select_features_1)Samples showing strong DUX4 and inflammatory signatures.
sample_name	cc_group	DUX4_group	pheno_type	visit	cc_intensity
32-0005b	cc+	DUX4+	FSHD	II	cc+
32-0002b1	cc+	DUX4+	FSHD	II	cc+
32-0003	cc+	DUX4+	FSHD	I	cc+
01-0029	cc+	DUX4+	FSHD	I	cc+
32-0005	cc+	DUX4+	FSHD	I	cc+
32-0004	cc+	DUX4+	FSHD	I	cc+
32-0006	cc+	DUX4+	FSHD	I	cc+
01-0025	cc+	DUX4+	FSHD	I	cc+

Select the up-reguated differentially expressed genes in DUX+/inflammed+ samples relative to the controls (adjusted \(p\)-vlaue < 0.05 corresponding to \(H_0: |lfc| = 2\)). This set of genes are informative in sample variation.

keep <- colnames(sanitized.dds) %in% ccp$sample_name |
  sanitized.dds$pheno_type == "Control"
dux4_dds <- sanitized.dds[, keep]
design(dux4_dds) <- ~ pheno_type
dux4_dds <- DESeq(dux4_dds)
dux4_res <- results(dux4_dds, alpha=0.05, lfcThreshold=2)

#' tidy up the up-regulated
df_up <- as.data.frame(dux4_res) %>%
  rownames_to_column(var="gencode_id") %>%
  dplyr::filter(padj < 0.05 & log2FoldChange > 0) %>%
  mutate(gene_name = rowData(dux4_dds[gencode_id])$gene_name) %>%
  mutate(gene_id =
           sapply(strsplit(gencode_id, ".", fixed=TRUE), "[[", 1))

Another set of genes that are also informative in terms of sample variation is the muscle development related genes (code below). We hereby select these two set of genes as our most informative features to construct sample space.

# muscle related, mostly differentially down-regulated in the biopsies with less muscle content
muscle <- c(
  "MYF5", "MYF6", "MYH1", # newly added
  "MYH13", "MYH2", "MYH4", "MYH6", "MYH7", # newly added
  "MYHAS", "MYL1", "MYL2", #newly added
  "MYO18B", "MYOD1", "MYOG", "MYOM2", "MYOM3",
  "MYOT", "MYOZ1", "MYOZ2", "MYOZ3", "PAX7", "PAX3")
de_markers <- c(df_up$gencode_id, unname(get_ensembl(muscle, sanitized.rlg)))

5.2 Sample space and k-means clustering

Defining anchor samples helps to stablize the k-means clustering.

#' anchar_sample: 01-0041 (control), 32-0002, "01-0029b", "32-0002b1", and "32-0012b"
anchor_sample <- 
  data.frame(label = c("A", "B", "C", "D", "E"),
             sample_name = c("01-0041", "01-0022-1", "01-0029b","32-0002b1", "32-0012b"),
             stringsAsFactors=FALSE)

Build sample space by PCA and call up k-means algorithm.

#' (3b) k-means (k=5) on PCA of de_markers
k <- 5
data <- assay(sanitized.rlg[de_markers, ])
pca <- prcomp(t(data))
#cl <- kmeans(pca$x, centers=k, iter.max=15, nstart=5)
cl <- kmeans(pca$x, centers=pca$x[anchor_sample$sample_name, ], iter.max=15)

.rename_cluster_5 <- function(sample_name) {
   anchor_samples <- c(A = "01-0041", B = "32-0002", C = "01-0029b",
                       D = "32-0002b1", E = "32-0012b")
   i <- which(anchor_sample$sample_name %in% as.character(sample_name))
   names(anchor_samples)[i]
}

rename_cluster <- function(cl, k) {
  df <- data.frame(sample_name=names(cl$cluster),
                   cluster=cl$cluster)
  tmp <- df %>% group_by(cluster) %>%
    summarise(rename_cluster=.rename_cluster_5(sample_name)) 
  df <- df %>% mutate(renamed=tmp$rename_cluster[cluster])
}

.face_off_cluster <- function(old_name) {
  new_name <- c(A="Mild", B="Moderate", C="IG-High", D="High", E="Muscle-Low")
  factor(new_name[old_name], levels=new_name)
}

cluster_color <- c(A_Cntr="#ff7f00", A="#a65628", B="#f781bf", C="#984ea3", D="#e41a1c", E="#377eb8")
cluster_df <- rename_cluster(cl, k) %>%
  mutate(RNA_cluster = ifelse(sanitized.rlg$pheno_type == "Control",
                              paste0(renamed, "_Cntr"), renamed)) %>%
  mutate(RNA_cluster = factor(RNA_cluster, levels=c("A_Cntr", "A", "B", "C", "D", "E"))) %>%
  mutate(color = cluster_color[as.character(RNA_cluster)])  %>%
  mutate(new_cluster_name = .face_off_cluster(renamed)) 

sanitized.rlg$RNA_cluster_5 <- sanitized.dds$RNA_cluster_5 <- cluster_df$RNA_cluster

5.3 Visualize the FSHD classes

Showing below is the visualization of FSHD sample space reduced to two dimensions by PCA’s first two components. Colors represent classes clustered by k-means. Cluster A are renamed to Mild, B to Moderate, C to Immunoglobulin-High (IG-High), D to High and E to Muscle-Low.

data <- plotPCA(sanitized.rlg, 
                intgroup=c("RNA_cluster_5", "pheno_type"), returnData=TRUE)
percentVar <- round(100 * attr(data, "percentVar"))
gg <- ggplot(data, aes(PC1, PC2, color=RNA_cluster_5)) +
  geom_point(size=0.8) +
  #xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  #ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  labs(title="Principal components on initial and follow-up visits") +
  geom_text_repel(aes(label=rownames(data)), size=2.2, show.legend=FALSE) +
  theme_bw() +
  scale_color_manual(name="Classes", values=cluster_color) +
  #scale_color_brewer(palette="Set2") +
  theme(legend.position="right", 
        plot.title = element_text(hjust = 0.5, size=10)) 
gg

5.4 View by selected marker genes

We selected informative biomarkers of six FSHD relevant biological functions and illustrate the characteristics of FSHD classes by heatmap. Row gaps separated by (top to bottom) DUX4-regulated, extracellular matrix, cell cycle, immune/inflammatory response, immunoglobulin and muscle development related. Column gaps separate RNA-seq clustered groups, A-E. Color scale corresponds to row z-score of regularized log2 expression.

robust_markers <- list(dux4 = c("ZSCAN4", "KHDC1L", "LEUTX", 
                          "PRAMEF2", "TRIM43"),
                 cellular_matrix=c("PLA2G2A", "COL19A1", "COL1A1",
                                   "COMP", "SFRP2"),
                 cell_cycle=c("CCNA1", "CDKN1A", "CDKN2A"),
                 inflam=c("CCL18", "CCL13", "C1QA", "C6", "C7"), # CCL18 is good for seperate control & FSHD
                 immunoglobulin = c("IGHG4", "IGHGP", "IGHG1"), 
                 muscle = c("PAX7", "PAX3", "MYOG", "MYOD1"))

marker_id <- unlist(lapply(robust_markers, get_ensembl, sanitized.rlg))
data_lst <- lapply(levels(sanitized.rlg$RNA_cluster_5), function(x, rlg) {
  s <- colnames(rlg)[rlg$RNA_cluster_5 == x]
  data <- assay(rlg[marker_id, s]) 
  # re-arrange by the colmeans (look heatmap)
  Colv <- colMeans(data, na.rm = TRUE)
  data <- data[, order(Colv)]
}, rlg=sanitized.rlg)
data <- do.call(cbind, data_lst)
rownames(data) <- rowData(sanitized.rlg[rownames(data)])$gene_name
zscore_data <- (data - rowMeans(data)) / rowSds(data)
gaps_row <- cumsum(lapply(robust_markers, length))

#' annotation bar
classes <- as.character(sanitized.rlg[, colnames(data)]$RNA_cluster_5)
pheno_type <- sanitized.dds[, colnames(data)]$pheno_type                         
annotation_col <- 
  data.frame(sample_name = colnames(data),
             classes     = classes,
             k_means     = sanitized.dds[, colnames(data)]$RNA_cluster_5,
             FSHD = sanitized.dds[, colnames(data)]$pheno_type) 

new_time <- bind_rows(mri_pathology$time_1, mri_pathology$time_2) %>% 
  dplyr::filter(sample_name!="32-0008b") %>%
  mutate(sample_name=as.character(sample_name))

new_time$sample_name[new_time$sample_name=="32-0002b"] <- "32-0002b1"
gaps_col <- cumsum(count(annotation_col, k_means)[, "n"])$n

annotation_col <- annotation_col %>%
  left_join(new_time, by="sample_name") %>%
  rename(Pathology=Pathology.Score, T1=fat_fraction) %>%
  dplyr::select(sample_name, classes, FSHD, T1, STIR_rating, Inflammation, Pathology) %>%
  column_to_rownames(var="sample_name")

## Warning: Column `sample_name` joining factor and character vector, coercing
## into character vector

col <- c("#f7f7f7", "#238b45")
ann_cor <- list(
  STIR_rating= col,  T1 = col, FSHD=c(Control="white", FSHD="seagreen"),
  Inflammation = col, Fibrosis = col,
  Var.in.Fiber.Size = col, Nec.Reg.Inflamm = col,
  Nucleation = col, Pathology = col,
  classes = cluster_color)

#' heatmap of robust markers
pheatmap(zscore_data, annotation_col=annotation_col,
         fontsize_row=7, fontsize_col=6,
         scale="none", 
         gaps_col=gaps_col,
         gaps_row=gaps_row,
         annotation_colors=ann_cor,
         cellheight=10, annotation_legend=FALSE,
         cluster_rows=FALSE, cluster_cols=FALSE)

5.5 MRI characterisitcs and pathology scores by FSHD classes

The chunk below summarizes the average MRI and pathology scores of each class.

histo <- bind_rows(mri_pathology$time_1, mri_pathology$time_2) %>%
  mutate(sample_name = as.character(sample_name)) %>%
  mutate(sample_name = ifelse(sample_name == "32-0002b", "32-0002b1", sample_name)) %>%
  left_join(cluster_df, by="sample_name")

## Warning: Column `sample_name` joining character vector and factor, coercing
## into character vector

#' sanity check: all cluster$sample_name are in time_var$sample_name
all(cluster_df$sample_name %in% histo$sample_name)

## [1] FALSE

#' join cluster_df to histology/mri scores
mri_stats <- histo %>% 
  group_by(new_cluster_name)  %>%
  summarize(pathology_avg = mean(Pathology.Score, na.rm=TRUE),
            pathology_sd = sd(Pathology.Score, na.rm=TRUE),
            T1_fraction_avg = mean(fat_fraction, na.rm=TRUE),
            T1_fraction_sd = sd(fat_fraction, na.rm=TRUE),
            STIR_avg = mean(STIR_rating),
            STIR_sd = sd(STIR_rating),
            Inflam_avg = mean(Inflammation, na.rm=TRUE),
            Infalm_se = mean(Inflammation, na.rm=TRUE))

library(wesanderson)
col <- as.character(wes_palette(n=4, name="Cavalcanti1", type="discrete"))
tmp <- mri_stats %>%
  dplyr::select(new_cluster_name, pathology_avg, Inflam_avg, STIR_avg) %>%
  rename(Pathology=pathology_avg, Inflamm=Inflam_avg, STIR=STIR_avg) %>%
  gather(feature, average.score, -new_cluster_name)
gg <- ggplot(tmp, aes(x=new_cluster_name, y=average.score, fill=feature)) +
  geom_bar(stat="identity", width=0.7)  +
  #scale_fill_brewer(palette="Set2") +
  scale_fill_manual(values=col) +
  theme_bw() +
  labs(y="Average score") +
  theme(plot.title = element_text(hjust = 0.5, size=9),
        legend.key.size = unit(0.8, "line"),
        #legend.position="bottom",
        axis.title.x = element_blank(),
        axis.title.y = element_text(size=10),
        axis.text.x = element_text(angle=25, hjust=1))
gg

(#fig:viz_mri_stats)Average MRI and pathology scores by FSHD classes.

5.6 Discriminative biomarkers characteristics by FSHD classes

The code chunk below summarizes the average expression of five biological processes in FSHD relative to the controls.

library(xlsx)
table_dir <- file.path(pkg_dir, "suppl_tables")
file_name <- file.path(table_dir, "suppl_table_5_Candidate_Biomarkers_and_Enriched_Go.xlsx")
#' High vs Control differential analysis sheet
de <- read.xlsx(file=file_name, sheetIndex=1, startRow=4)
ecm_de <- de %>% filter(ecm == TRUE) %>% pull(gencode_id) %>% as.character(.)
dux4_de <- de %>% filter(DUX4_induced == TRUE) %>% pull(gencode_id) %>% as.character(.)
inflamm_de <- de %>% filter(inflamm == TRUE | immune == TRUE) %>% 
  pull(gencode_id) %>% as.character(.)
stress_de <- de %>% filter(stress == TRUE) %>% pull(gencode_id) %>% as.character(.)
ig_de <- de %>% filter(grepl("IGH", gene_name) | grepl("IGK", gene_name)) %>%
  pull(gencode_id) %>% as.character(.)
de_list <- list(Extracellur_Matrix=ecm_de, DUX4=dux4_de, Inflammatory=inflamm_de, Immunoglobulin=ig_de)

#' add muscle
muscle <- c(
  "MYF5", "MYF6", "MYH1", # newly added
  "MYH13", "MYH2", "MYH4", "MYH6", "MYH7", # newly added
  "MYHAS", "MYL1", "MYL2", #newly added
  "MYO18B", "MYOD1", "MYOG", "MYOM2", "MYOM3",
  "MYOT", "MYOZ1", "MYOZ2", "MYOZ3", "PAX7", "PAX3")
de_list$Muscle <- unname(get_ensembl(muscle, sanitized.rlg))

# average expression relative to controls
de_avg_per_class <- lapply(de_list, function(id){
  tmp <- sanitized.rlg[id, ]
  class <- levels(sanitized.rlg$RNA_cluster_5)
  avg_rlg <- sapply(class, function(x){
    colMeans(assay(tmp)[, tmp$RNA_cluster_5 == x])
  })
  avg_per_class <- sapply(avg_rlg, mean)
  avg_per_class - avg_per_class["A_Cntr"]
})

de_avg_per_class <- as.data.frame(do.call(cbind, de_avg_per_class)) %>%
  rownames_to_column(var="class") %>% 
  dplyr::filter(class != "A_Cntr") %>%
  mutate(RNA_cluster = .face_off_cluster(class)) %>%
  dplyr::select(-class) %>%
  gather(feature, average.score, -RNA_cluster) %>%
  mutate(feature=factor(feature))

col <- as.character(wes_palette(n=5, name="Darjeeling1", type="discrete"))
gg <- ggplot(de_avg_per_class, aes(x=RNA_cluster, y=average.score, fill=feature)) +
  geom_bar(stat="identity", width=0.75)  +
  #scale_fill_brewer(palette="Set2") +
  scale_fill_manual(values=col) +
  theme_bw() +
  labs(y="Average expression - control") +
  theme(plot.title = element_text(hjust = 0.5, size=9),
        axis.title.x = element_blank(),
        axis.title.y = element_text(size=9),
        axis.text.x = element_text(angle=25, hjust=1, size=8), 
        legend.key.size = unit(0.75, "line"))
gg