G34-gliomas project [source]


1 Configuration

Configuration of project directory & analysis outputs:

Show full config

library(here)

# Set up outputs
message("Document index: ", doc_id)
## Document index: 04
# Specify where to save outputs
out        <- here(subdir, "output", doc_id); dir.create(out, recursive = TRUE)
figout     <- here(subdir, "figures", doc_id, "/"); dir.create(figout, recursive = TRUE)
cache      <- paste0("~/tmp/", basename(here()), "/", subdir, "/", doc_id, "/")

message("Cache: ", cache)
## Cache: ~/tmp/G34-gliomas/bulk_transcriptome_epigenome/04/

The root directory of this project is:

## /lustre03/project/6004736/sjessa/from_beluga/HGG-G34/G34-gliomas

Outputs and figures will be saved at these paths, relative to project root:

## G34-gliomas/bulk_transcriptome_epigenome/output/04
## G34-gliomas/bulk_transcriptome_epigenome/figures/04//

Setting a random seed:

set.seed(100)

2 Overview

In this document, we analyze bulk RNA-seq data from the G34-mutant patient-derived cell line GMB002 and isogenic matches, in which CRISPR editing was used to remove the mutation. Cell lines were also subjected to a differentiation protocol.

3 Libraries

# General
library(tidyr)
library(readr)
library(readxl)
library(dplyr)
library(magrittr)
library(glue)
library(purrr)

# For plotting
library(ggplot2)
library(scales)
library(ggrepel)
library(cowplot)
ggplot2::theme_set(theme_min(base_size = 13))

# Custom
source(here::here(subdir, "analysis/functions.R"))
source(here::here(subdir, "analysis/ssgsea.R"))

4 Analysis

4.1 Prep bulk RNAseq analysis

4.1.1 Sample metadata

Load in the metadata table for all bulk RNAseq samples to be included in the analysis:

(cell_line_meta <- readxl::read_xlsx(here(subdir, "data/20200219-cell_line_metadata.xlsx")) %>% 
     mutate(Group_broad = paste0(Genotype_broad, "_", Media)) %>% 
     mutate(Group_broad = gsub("/", "", Group_broad)))
# Number of samples per group
table(cell_line_meta$Group)
## 
##    H3F3A(+/-) nsm    H3F3A(+/-) ser    H3F3A(+/+) nsm    H3F3A(+/+) ser 
##                 3                 5                 3                 1 
## H3F3A(+/G34R) nsm H3F3A(+/G34R) ser H3F3A(+/G34V) ser 
##                10                 3                 1

4.1.2 Prepare bulk RNA-seq pipeline input

Palettes for groups:

palette_groups <- c("G34R_ser" = "cyan4",
                    "G34R_nsm" = "cyan1",
                    "G34V_ser" = "cyan4",
                    "G34V_nsm" = "cyan1",
                    "KORepair_ser" = "navy",
                    "KORepair_nsm" = "blue")

save(palette_groups, file = glue("{out}/cl_palettes.Rda"))

We will need one version with all samples together, since normalization is dataset-dependent, and we may want to compare expression levels across groups.

dir.create(file.path(out, "All_cell_line_data"), showWarnings = FALSE)

cell_line_meta %>%
    select(ID, Nickname, Group = Group_broad) %>%
    write_tsv(file.path(out, "All_cell_line_data", "info.samples.tsv"))

data.frame(Group = unique(cell_line_meta$Group_broad),
           Label = unique(cell_line_meta$Group_broad),
           Order = seq_along(unique(cell_line_meta$Group_broad)),
           Color = palette_groups[unique(cell_line_meta$Group_broad)],
           stringsAsFactors = FALSE) %>%
    write_tsv(file.path(out, "All_cell_line_data", "info.groups.tsv"))

4.2 Run pipeline on Beluga

In order to run the in-house DESeq2/expression analysis pipeline, the inputs created above are synced to the pipeline folder, and then the following outputs are loaded for further analysis below and in the subsequent analyses:

  • Alignment stats
  • Differential gene expession analysis output
  • Counts
pipeline_path <- "../../../level3/2020-01_G34_submission1_add_samples/Cell_lines3/"

4.3 Alignment stats

align_stats <- read_tsv(file.path(pipeline_path, "All_cell_line_data", "alignment.statistics.tsv")) %>%
    separate(mitochondrialPercentage, sep = "%", into = "mitochondrialPercentage") %>%
    mutate(mitochondrialPercentage = as.numeric(mitochondrialPercentage))
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   rawReadCount = col_double(),
##   cleanReadCount = col_double(),
##   unmappedReadCount = col_double(),
##   mappedReadCount = col_double(),
##   wholeGeneReadCount = col_double(),
##   exonReadCount = col_double(),
##   intronReadCount = col_double(),
##   utr5ReadCount = col_double(),
##   cdsReadCount = col_double(),
##   utr3ReadCount = col_double(),
##   mitochondrialReadCount = col_double(),
##   ribosomalReadCount = col_double()
## )
## See spec(...) for full column specifications.
# DT::datatable(align_stats, filter = "top", class = 'white-space: nowrap')

left_join(cell_line_meta, align_stats, by = c("Nickname" = "name")) %>%
    write_tsv(glue("{out}/cell_line_metadata_with_align_stats.tsv"))

4.4 G34R vs KO / repair in serum

4.4.1 Load DGE

dge_serum <- read_tsv(file.path(pipeline_path, "GBM002_ser/diff/Ensembl.ensGene.exon/G34R_servsKORepair_ser.tsv")) %>% 
    separate(ID, into = c("ENSID", "gene_symbol"), sep = ":") %>% 
    mutate(log10p = -log10(padj))
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   baseMean = col_double(),
##   log2FoldChange = col_double(),
##   lfcSE = col_double(),
##   stat = col_double(),
##   pvalue = col_double(),
##   padj = col_double()
## )
dge_serum %>% 
    filter(baseMean > 100 & padj < 0.05) %>% 
    rr_ggplot(aes(x = log2FoldChange, y = log10p, label = gene_symbol), alpha = 0.8, plot_num = 1) +
    geom_point()

dge_stem <- read_tsv(file.path(pipeline_path, "GBM002_nsm/diff/Ensembl.ensGene.exon/G34R_nsmvsKORepair_nsm.tsv")) %>% 
    separate(ID, into = c("ENSID", "gene_symbol"), sep = ":") %>% 
    mutate(log10p = -log10(padj))
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   baseMean = col_double(),
##   log2FoldChange = col_double(),
##   lfcSE = col_double(),
##   stat = col_double(),
##   pvalue = col_double(),
##   padj = col_double()
## )

4.4.2 Targeted DGE, for serum condition - GBM002

Note that I have taken care, here, to use the counts normalized for the serum samples together only.

goi <- c("DLX1", "DLX2", "DLX5", "DLX6", "PDGFRA", "GSX2")

gbm002_counts <- extract_pipeline_counts(file.path(pipeline_path, "GBM002_ser/counts/Ensembl.ensGene.exon.norm.tsv.gz"),
                                         goi = goi) %>%
    left_join(cell_line_meta, by = c("sample" = "Nickname")) %>% 
    left_join(dge_serum, by = c("gene_symbol" = "gene_symbol", "gene_ensg" = "ENSID"))
## Joining, by = "gene_symbol"
dge_serum %>% select(-ENSID) %>% filter(gene_symbol %in% goi)

For the DLXs only

dlx_limits <- list("DLX1"   = c(0, 21000),
                   "DLX2"   = c(0, 21000),
                   "DLX5"   = c(0, 1600),
                   "DLX6"   = c(0, 1600),
                   "PDGFRA" = c(0, 22000),
                   "GSX2"   = c(0, 200))

cl_dotplot <- function(gene, lims, counts, levels = c("G34R_nsm", "KORepair_nsm",
                                                      "G34R_ser", "KORepair_ser")) {
    
    counts %>%
        filter(gene_symbol == gene) %>% 
        mutate(Group_broad = factor(Group_broad,
                                    levels = levels)) %>% 
        ggplot(aes(x = Group_broad, y = gene_expression)) +
        geom_jitter(aes(colour = Group_broad), size = 5, alpha = 0.87, width = 0.1) +
        scale_colour_manual(values = c("cyan4", "midnightblue")) +
        stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
                     geom = "crossbar", width = 0.2) +
        theme_min(border_colour = "black") +
        ylim(lims) +
        ggtitle(gene) +
        rotateX() +
        noLegend()
    
}

gbm002_counts %>% 
    select(sample, gene_symbol, gene_expression, Group_broad) %>% 
    write_tsv(glue("{figout}/dlx_serum_boxplot-1.source_data.tsv"))

imap(dlx_limits, ~ cl_dotplot(.y, .x, counts = gbm002_counts)) %>% 
    {plot_grid(plotlist = ., ncol = 6)}


[figure/source data @ G34-gliomas/bulk_transcriptome_epigenome/figures/04//dlx_serum_boxplot…]

4.4.2.1 Investigation of DGE in the serum condition

# Load up DGE results from DESeq2
res_ser <- read_tsv(file.path(pipeline_path, "GBM002_ser/diff/Ensembl.ensGene.exon/G34R_servsKORepair_ser.tsv")) %>% 
    separate(ID, into = c("ENSG", "symbol"), sep = ":") 
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   baseMean = col_double(),
##   log2FoldChange = col_double(),
##   lfcSE = col_double(),
##   stat = col_double(),
##   pvalue = col_double(),
##   padj = col_double()
## )
# Construct the input dataframe for an MA plot
res_ser2 <- res_ser %>%
    select(symbol, baseMean, log2FoldChange, padj) %>%
    mutate(signif   = ifelse( is.na(padj), FALSE, padj < 0.05 )) %>% 
    mutate(dlx      = symbol %in% c("DLX1", "DLX2", "DLX5", "DLX6")) %>% 
    mutate(fc = case_when(
        log2FoldChange > 2 ~ "LFC > 2",
        log2FoldChange < -2 ~ "LFC < -2",
        TRUE ~ "-2 < LFC < 2")) %>% 
    mutate(log2FoldChange = ifelse(abs(log2FoldChange) > 2,
                                   sign(log2FoldChange) * 2,
                                   log2FoldChange))

# Make an MA plot, labelling the DLX genes
res_ser2 %>%
    ggplot(aes(x = log10(baseMean), y = log2FoldChange)) +
    geom_hline(yintercept = 0, colour = "gray90") +
    geom_point(aes(colour = signif, size = dlx, shape = fc), alpha = 0.5) +
    scale_colour_manual(values = c("gray50", "red")) +
    scale_size_manual(values = c("TRUE" = 3, "FALSE" = 0.5)) +
    scale_shape_manual(values = c("LFC > 2" = 24, "LFC < -2" = 25, "-2 < LFC < 2" = 19)) +
    geom_text_repel(data = res_ser2 %>% filter(symbol %in% c("DLX1", "DLX2", "DLX5", "DLX6")),
               aes(x = log10(baseMean), y = log2FoldChange, label = symbol)) +
    ylim(c(-2, 2))

Examination of p-values:

# Here's a histongram of p-values across genes
res_ser %>% 
    gather(stat, value, pvalue, padj) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    facet_wrap(~ stat, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Filter to genes with baseMean > 100 and redraw the plot
res_ser %>% 
    filter(baseMean > 100) %>% 
    gather(stat, value, pvalue, padj) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    facet_wrap(~ stat, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4.4.3 Targeted DGE, for stem condition - GBM002

gbm002_counts_nsm <- extract_pipeline_counts(file.path(pipeline_path, "GBM002_nsm/counts/Ensembl.ensGene.exon.norm.tsv.gz"),
                                             goi = goi) %>%
    left_join(cell_line_meta, by = c("sample" = "Nickname")) %>% 
    left_join(dge_stem, by = c("gene_symbol" = "gene_symbol", "gene_ensg" = "ENSID"))
## Joining, by = "gene_symbol"
dge_stem %>% select(-ENSID) %>% filter(gene_symbol %in% goi)
dlx_limits <- list("DLX1"   = c(0, 4000),
                   "DLX2"   = c(0, 5300),
                   "DLX5"   = c(0, 400),
                   "DLX6"   = c(0, 500),
                   "PDGFRA" = c(0, 30000),
                   "GSX2"   = c(0, 30))

gbm002_counts_nsm %>% 
    select(sample, gene_symbol, gene_expression, Group_broad) %>% 
    write_tsv(glue("{figout}/dlx_stem_boxplot-1.source_data.tsv"))

imap(dlx_limits, ~ cl_dotplot(.y, .x, counts = gbm002_counts_nsm)) %>% 
    {plot_grid(plotlist = ., ncol = 6)}


[figure/source data @ G34-gliomas/bulk_transcriptome_epigenome/figures/04//dlx_stem_boxplot…]

4.5 ssGSEA

4.5.1 Prep counts

raw_counts <- file.path(pipeline_path, "All_GBM002_new/counts/Ensembl.ensGene.exon.raw.tsv.gz") %>%
    read.table(header = T, row.names = 1, check.names = F, sep = "\t")

head(rownames(raw_counts) %<>% strsplit(":") %>% sapply(getElement, 1))
## [1] "ENSG00000118473" "ENSG00000162426" "ENSG00000157191" "ENSG00000169504"
## [5] "ENSG00000142920" "ENSG00000186094"
raw_counts <- as.matrix(raw_counts)

raw_counts[1:5, 1:5]
##                 HSJD-GBM002_A10_p58 HSJD-GBM002_A10_p59 GBM002_cA10_p58
## ENSG00000118473                  37                  30              80
## ENSG00000162426                  90                  81             174
## ENSG00000157191                 475                 473            1017
## ENSG00000169504                7542                6570           19732
## ENSG00000142920                  55                  48              89
##                 HSJD-GBM002_F06_p55 HSJD-GBM002_F06_p56
## ENSG00000118473                 114                  62
## ENSG00000162426                  80                  40
## ENSG00000157191                 531                 364
## ENSG00000169504               11481                5515
## ENSG00000142920                  66                  20
rr_saveRDS(object = raw_counts,
           desc = "Raw RNA-seq counts for GBM002 cell lines",
           file = glue("{out}/cl_raw_counts.Rds"))
## ...writing description of cl_raw_counts.Rds to G34-gliomas/bulk_transcriptome_epigenome/output/04/cl_raw_counts.desc
dim(raw_counts)
## [1] 60234    16

4.5.2 Load signatures

Loading the cell type gene signatures prepared previously for GSEA, and selecting some representative signatures for relevant cell types:

signatures <- readRDS(here(subdir, "output/02/signatures_ens.Rds"))

# Select pathways of interest
poi <- list("newborn_inhib_neuron" = signatures$`HF nIN1`,
            "mature_inhib_neuron" = signatures$`HP IN-PV`,
            "opc" = signatures$`F-p6 OPC`,
            "astrocyte" = signatures$`F-p0 ASTR2`,
            "rgc" = signatures$`HF RG-early`,
            "fetal_excit_neurons" = signatures$`HF EN-PFC1`)

4.5.3 Run ssGSEA

cl_ssgsea <- ssgsea_le(expr_mat = raw_counts,
                       gene_sets = poi,
                       # Following exactly the same parameters as we used
                       # in the NG bulk analysis
                       alpha = 0.75,
                       normalize = FALSE)

cl_ssgsea$enrichment_scores %>%
    as.data.frame() %>%
    tibble::rownames_to_column(var = "Signature") %>%
    rr_write_tsv(path = glue("{out}/cl_ssgsea_scores.tsv"),
                 desc = "ssGSEA scores for cell type signatures in GBM002 cell lines")
## ...writing description of cl_ssgsea_scores.tsv to G34-gliomas/bulk_transcriptome_epigenome/output/04/cl_ssgsea_scores.desc
leading_edge <- cl_ssgsea$leading_edge
rr_saveRDS(object = leading_edge,
           desc = "Leading edge froms ssGSEA analysis of cell type signatures in GBM002 cell lines",
           file = glue("{out}/cl_ssgsea_le.Rds"))
## ...writing description of cl_ssgsea_le.Rds to G34-gliomas/bulk_transcriptome_epigenome/output/04/cl_ssgsea_le.desc

4.5.4 Visualize results

cl_ssgsea_tidy <- cl_ssgsea$enrichment_scores %>%
    as.data.frame() %>%
    tibble::rownames_to_column(var = "Signature") %>%
    gather(Sample, Score, 2:ncol(.)) %>%
    left_join(cell_line_meta, by = c("Sample" = "Nickname"))

cl_ssgsea_tidy %>%
    filter(Signature %in% c("newborn_inhib_neuron", "rgc")) %>%
    mutate(Signature = factor(Signature, levels = c("rgc", "newborn_inhib_neuron"))) %>%
    filter(Media == "ser") %>%
    mutate(Group_broad = factor(Group_broad,
                                levels = c("G34R_nsm", "KORepair_nsm",
                                           "G34R_ser", "KORepair_ser"))) %>%
    rr_ggplot(aes(x = Group_broad, y = Score), plot_num = 1) +
    geom_jitter(aes(colour = Group_broad), size = 5, alpha = 0.87, width = 0.1) +
    scale_colour_manual(values = c("cyan4", "midnightblue")) +
    stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
                 geom = "crossbar", width = 0.2) +
    theme_min(border_colour = "black") +
    facet_wrap(~ Signature) +
    rotateX() +
    noLegend() +
    ylim(c(22400, 24000))


[figure/source data @ G34-gliomas/bulk_transcriptome_epigenome/figures/04//viz_ssgsea…]

4.6 Distinguishing the effect of G34 vs. cell of origin

In a comparison of H3K27me3 at promoters in G34-mutant HGG patient tumor samples vs non-G34 mutant HGGs, G34 tumors have several thousand promoters enriched for K27me3 compared to the other tumor types.

To investigate whether these are an effect of the G34 mutation, or an effect of cell of origin (both of which differ between patient tumors), we’ll look at the expression of these promoters in the CRISPR cell lines, allowing to decouple the effect of G34 (which differs in the isogenic cell line pairs) vs. cell of origin (which does not).

# Read in the tumour promoter K27me3 scores
tumor_k27me3 <- read_xlsx(here(subdir, "data/TABLE-S3_G34-HGG-Epigenome.xlsx"))

# Subset by thresholds used in the paper
# z-score_H3K27me3 > 0.5, pvalue_H3K27me3 < 0.05
tumor_k27me3_filt <- tumor_k27me3 %>% 
    filter(`z-score_H3K27me3` > 0.5) %>% 
    distinct(name, .keep_all = TRUE)

# Read in the counts for the cell lines
gbm002_counts <- extract_pipeline_counts(file.path(pipeline_path, "GBM002_ser/counts/Ensembl.ensGene.exon.norm.tsv.gz"),
                                         goi = tumor_k27me3_filt$name) %>%
    left_join(cell_line_meta, by = c("sample" = "Nickname"))
## Joining, by = "gene_symbol"
# Check fold changes from RNA-seq DGE in stem at these promoters
dge_stem %>% 
    filter(gene_symbol %in% tumor_k27me3_filt$name) %>% 
    ggplot(aes(x = log2FoldChange)) +
    geom_density()

# Check what proportion of those genes have associated significant DGE
dge_stem %>%
    filter(gene_symbol %in% tumor_k27me3_filt$name) %>% 
    mutate(significant_upon_editing = case_when(
        is.na(padj) ~ "No",
        padj < 0.05 & log2FoldChange > 1 ~ "Yes",
        TRUE ~ "No")) %>%
    count(significant_upon_editing) %>% 
    ggplot(aes(x = "", y = n)) +
    geom_bar(aes(fill = significant_upon_editing), stat = "identity") +
    coord_polar("y", start = 0) +
    theme_void() +
    geom_text(aes(x = c(1, 1), y = c(1000, 0), label = n), size=5)


5 Reproducibility

This document was last rendered on:

## 2020-12-11 13:30:54

The git repository and last commit:

## Local:    master /lustre03/project/6004736/sjessa/from_beluga/HGG-G34/G34-gliomas
## Remote:   master @ origin (git@github.com:fungenomics/G34-gliomas.git)
## Head:     [71d3b7e] 2020-09-28: Ignore slurm submission script

The random seed was set with set.seed(100)

The R session info:

## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
## 
## locale:
##  [1] LC_CTYPE=en_CA.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_CA.UTF-8    
##  [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_CA.UTF-8   
##  [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices datasets  utils     methods   base     
## 
## other attached packages:
##  [1] pbapply_1.4-3 cowplot_0.9.4 ggrepel_0.8.0 scales_1.1.1  ggplot2_3.1.0
##  [6] purrr_0.3.4   glue_1.4.2    magrittr_1.5  dplyr_0.8.0   readxl_1.2.0 
## [11] readr_1.3.1   tidyr_0.8.2   here_0.1     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.5          git2r_0.27.1        plyr_1.8.6         
##  [4] pillar_1.4.6        compiler_3.5.1      cellranger_1.1.0   
##  [7] RColorBrewer_1.1-2  BiocManager_1.30.10 tools_3.5.1        
## [10] digest_0.6.25       evaluate_0.14       lifecycle_0.2.0    
## [13] tibble_3.0.3        gtable_0.3.0        pkgconfig_2.0.3    
## [16] rlang_0.4.7         parallel_3.5.1      yaml_2.2.1         
## [19] xfun_0.17           withr_2.2.0         stringr_1.4.0      
## [22] knitr_1.29          vctrs_0.3.4         hms_0.5.3          
## [25] rprojroot_1.3-2     grid_3.5.1          tidyselect_1.1.0   
## [28] R6_2.4.1            rmarkdown_1.11      backports_1.1.9    
## [31] ellipsis_0.3.1      htmltools_0.5.0     assertthat_0.2.1   
## [34] colorspace_1.4-1    renv_0.10.0         stringi_1.5.3      
## [37] lazyeval_0.2.2      munsell_0.5.0       crayon_1.3.4


A project of the Kleinman Lab at McGill University, using the rr reproducible research template.
home GitHub