G34-gliomas project [source]
Configuration of project directory & analysis outputs:
Show full config
library(here)
# Set up outputs
message("Document index: ", doc_id)
## Document index: 04
# Specify where to save outputs
out <- here(subdir, "output", doc_id); dir.create(out, recursive = TRUE)
figout <- here(subdir, "figures", doc_id, "/"); dir.create(figout, recursive = TRUE)
cache <- paste0("~/tmp/", basename(here()), "/", subdir, "/", doc_id, "/")
message("Cache: ", cache)
## Cache: ~/tmp/G34-gliomas/bulk_transcriptome_epigenome/04/
The root directory of this project is:
## /lustre03/project/6004736/sjessa/from_beluga/HGG-G34/G34-gliomas
Outputs and figures will be saved at these paths, relative to project root:
## G34-gliomas/bulk_transcriptome_epigenome/output/04
## G34-gliomas/bulk_transcriptome_epigenome/figures/04//
Setting a random seed:
set.seed(100)
In this document, we analyze bulk RNA-seq data from the G34-mutant patient-derived cell line GMB002 and isogenic matches, in which CRISPR editing was used to remove the mutation. Cell lines were also subjected to a differentiation protocol.
# General
library(tidyr)
library(readr)
library(readxl)
library(dplyr)
library(magrittr)
library(glue)
library(purrr)
# For plotting
library(ggplot2)
library(scales)
library(ggrepel)
library(cowplot)
ggplot2::theme_set(theme_min(base_size = 13))
# Custom
source(here::here(subdir, "analysis/functions.R"))
source(here::here(subdir, "analysis/ssgsea.R"))
Load in the metadata table for all bulk RNAseq samples to be included in the analysis:
(cell_line_meta <- readxl::read_xlsx(here(subdir, "data/20200219-cell_line_metadata.xlsx")) %>%
mutate(Group_broad = paste0(Genotype_broad, "_", Media)) %>%
mutate(Group_broad = gsub("/", "", Group_broad)))
# Number of samples per group
table(cell_line_meta$Group)
##
## H3F3A(+/-) nsm H3F3A(+/-) ser H3F3A(+/+) nsm H3F3A(+/+) ser
## 3 5 3 1
## H3F3A(+/G34R) nsm H3F3A(+/G34R) ser H3F3A(+/G34V) ser
## 10 3 1
Palettes for groups:
palette_groups <- c("G34R_ser" = "cyan4",
"G34R_nsm" = "cyan1",
"G34V_ser" = "cyan4",
"G34V_nsm" = "cyan1",
"KORepair_ser" = "navy",
"KORepair_nsm" = "blue")
save(palette_groups, file = glue("{out}/cl_palettes.Rda"))
We will need one version with all samples together, since normalization is dataset-dependent, and we may want to compare expression levels across groups.
dir.create(file.path(out, "All_cell_line_data"), showWarnings = FALSE)
cell_line_meta %>%
select(ID, Nickname, Group = Group_broad) %>%
write_tsv(file.path(out, "All_cell_line_data", "info.samples.tsv"))
data.frame(Group = unique(cell_line_meta$Group_broad),
Label = unique(cell_line_meta$Group_broad),
Order = seq_along(unique(cell_line_meta$Group_broad)),
Color = palette_groups[unique(cell_line_meta$Group_broad)],
stringsAsFactors = FALSE) %>%
write_tsv(file.path(out, "All_cell_line_data", "info.groups.tsv"))
In order to run the in-house DESeq2/expression analysis pipeline, the inputs created above are synced to the pipeline folder, and then the following outputs are loaded for further analysis below and in the subsequent analyses:
pipeline_path <- "../../../level3/2020-01_G34_submission1_add_samples/Cell_lines3/"
align_stats <- read_tsv(file.path(pipeline_path, "All_cell_line_data", "alignment.statistics.tsv")) %>%
separate(mitochondrialPercentage, sep = "%", into = "mitochondrialPercentage") %>%
mutate(mitochondrialPercentage = as.numeric(mitochondrialPercentage))
## Parsed with column specification:
## cols(
## .default = col_character(),
## rawReadCount = col_double(),
## cleanReadCount = col_double(),
## unmappedReadCount = col_double(),
## mappedReadCount = col_double(),
## wholeGeneReadCount = col_double(),
## exonReadCount = col_double(),
## intronReadCount = col_double(),
## utr5ReadCount = col_double(),
## cdsReadCount = col_double(),
## utr3ReadCount = col_double(),
## mitochondrialReadCount = col_double(),
## ribosomalReadCount = col_double()
## )
## See spec(...) for full column specifications.
# DT::datatable(align_stats, filter = "top", class = 'white-space: nowrap')
left_join(cell_line_meta, align_stats, by = c("Nickname" = "name")) %>%
write_tsv(glue("{out}/cell_line_metadata_with_align_stats.tsv"))
dge_serum <- read_tsv(file.path(pipeline_path, "GBM002_ser/diff/Ensembl.ensGene.exon/G34R_servsKORepair_ser.tsv")) %>%
separate(ID, into = c("ENSID", "gene_symbol"), sep = ":") %>%
mutate(log10p = -log10(padj))
## Parsed with column specification:
## cols(
## ID = col_character(),
## baseMean = col_double(),
## log2FoldChange = col_double(),
## lfcSE = col_double(),
## stat = col_double(),
## pvalue = col_double(),
## padj = col_double()
## )
dge_serum %>%
filter(baseMean > 100 & padj < 0.05) %>%
rr_ggplot(aes(x = log2FoldChange, y = log10p, label = gene_symbol), alpha = 0.8, plot_num = 1) +
geom_point()
dge_stem <- read_tsv(file.path(pipeline_path, "GBM002_nsm/diff/Ensembl.ensGene.exon/G34R_nsmvsKORepair_nsm.tsv")) %>%
separate(ID, into = c("ENSID", "gene_symbol"), sep = ":") %>%
mutate(log10p = -log10(padj))
## Parsed with column specification:
## cols(
## ID = col_character(),
## baseMean = col_double(),
## log2FoldChange = col_double(),
## lfcSE = col_double(),
## stat = col_double(),
## pvalue = col_double(),
## padj = col_double()
## )
Note that I have taken care, here, to use the counts normalized for the serum samples together only.
goi <- c("DLX1", "DLX2", "DLX5", "DLX6", "PDGFRA", "GSX2")
gbm002_counts <- extract_pipeline_counts(file.path(pipeline_path, "GBM002_ser/counts/Ensembl.ensGene.exon.norm.tsv.gz"),
goi = goi) %>%
left_join(cell_line_meta, by = c("sample" = "Nickname")) %>%
left_join(dge_serum, by = c("gene_symbol" = "gene_symbol", "gene_ensg" = "ENSID"))
## Joining, by = "gene_symbol"
dge_serum %>% select(-ENSID) %>% filter(gene_symbol %in% goi)
For the DLXs only
dlx_limits <- list("DLX1" = c(0, 21000),
"DLX2" = c(0, 21000),
"DLX5" = c(0, 1600),
"DLX6" = c(0, 1600),
"PDGFRA" = c(0, 22000),
"GSX2" = c(0, 200))
cl_dotplot <- function(gene, lims, counts, levels = c("G34R_nsm", "KORepair_nsm",
"G34R_ser", "KORepair_ser")) {
counts %>%
filter(gene_symbol == gene) %>%
mutate(Group_broad = factor(Group_broad,
levels = levels)) %>%
ggplot(aes(x = Group_broad, y = gene_expression)) +
geom_jitter(aes(colour = Group_broad), size = 5, alpha = 0.87, width = 0.1) +
scale_colour_manual(values = c("cyan4", "midnightblue")) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
geom = "crossbar", width = 0.2) +
theme_min(border_colour = "black") +
ylim(lims) +
ggtitle(gene) +
rotateX() +
noLegend()
}
gbm002_counts %>%
select(sample, gene_symbol, gene_expression, Group_broad) %>%
write_tsv(glue("{figout}/dlx_serum_boxplot-1.source_data.tsv"))
imap(dlx_limits, ~ cl_dotplot(.y, .x, counts = gbm002_counts)) %>%
{plot_grid(plotlist = ., ncol = 6)}
[figure/source data @ G34-gliomas/bulk_transcriptome_epigenome/figures/04//dlx_serum_boxplot…]
# Load up DGE results from DESeq2
res_ser <- read_tsv(file.path(pipeline_path, "GBM002_ser/diff/Ensembl.ensGene.exon/G34R_servsKORepair_ser.tsv")) %>%
separate(ID, into = c("ENSG", "symbol"), sep = ":")
## Parsed with column specification:
## cols(
## ID = col_character(),
## baseMean = col_double(),
## log2FoldChange = col_double(),
## lfcSE = col_double(),
## stat = col_double(),
## pvalue = col_double(),
## padj = col_double()
## )
# Construct the input dataframe for an MA plot
res_ser2 <- res_ser %>%
select(symbol, baseMean, log2FoldChange, padj) %>%
mutate(signif = ifelse( is.na(padj), FALSE, padj < 0.05 )) %>%
mutate(dlx = symbol %in% c("DLX1", "DLX2", "DLX5", "DLX6")) %>%
mutate(fc = case_when(
log2FoldChange > 2 ~ "LFC > 2",
log2FoldChange < -2 ~ "LFC < -2",
TRUE ~ "-2 < LFC < 2")) %>%
mutate(log2FoldChange = ifelse(abs(log2FoldChange) > 2,
sign(log2FoldChange) * 2,
log2FoldChange))
# Make an MA plot, labelling the DLX genes
res_ser2 %>%
ggplot(aes(x = log10(baseMean), y = log2FoldChange)) +
geom_hline(yintercept = 0, colour = "gray90") +
geom_point(aes(colour = signif, size = dlx, shape = fc), alpha = 0.5) +
scale_colour_manual(values = c("gray50", "red")) +
scale_size_manual(values = c("TRUE" = 3, "FALSE" = 0.5)) +
scale_shape_manual(values = c("LFC > 2" = 24, "LFC < -2" = 25, "-2 < LFC < 2" = 19)) +
geom_text_repel(data = res_ser2 %>% filter(symbol %in% c("DLX1", "DLX2", "DLX5", "DLX6")),
aes(x = log10(baseMean), y = log2FoldChange, label = symbol)) +
ylim(c(-2, 2))
Examination of p-values:
# Here's a histongram of p-values across genes
res_ser %>%
gather(stat, value, pvalue, padj) %>%
ggplot(aes(x = value)) +
geom_histogram() +
facet_wrap(~ stat, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Filter to genes with baseMean > 100 and redraw the plot
res_ser %>%
filter(baseMean > 100) %>%
gather(stat, value, pvalue, padj) %>%
ggplot(aes(x = value)) +
geom_histogram() +
facet_wrap(~ stat, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gbm002_counts_nsm <- extract_pipeline_counts(file.path(pipeline_path, "GBM002_nsm/counts/Ensembl.ensGene.exon.norm.tsv.gz"),
goi = goi) %>%
left_join(cell_line_meta, by = c("sample" = "Nickname")) %>%
left_join(dge_stem, by = c("gene_symbol" = "gene_symbol", "gene_ensg" = "ENSID"))
## Joining, by = "gene_symbol"
dge_stem %>% select(-ENSID) %>% filter(gene_symbol %in% goi)
dlx_limits <- list("DLX1" = c(0, 4000),
"DLX2" = c(0, 5300),
"DLX5" = c(0, 400),
"DLX6" = c(0, 500),
"PDGFRA" = c(0, 30000),
"GSX2" = c(0, 30))
gbm002_counts_nsm %>%
select(sample, gene_symbol, gene_expression, Group_broad) %>%
write_tsv(glue("{figout}/dlx_stem_boxplot-1.source_data.tsv"))
imap(dlx_limits, ~ cl_dotplot(.y, .x, counts = gbm002_counts_nsm)) %>%
{plot_grid(plotlist = ., ncol = 6)}
[figure/source data @ G34-gliomas/bulk_transcriptome_epigenome/figures/04//dlx_stem_boxplot…]
raw_counts <- file.path(pipeline_path, "All_GBM002_new/counts/Ensembl.ensGene.exon.raw.tsv.gz") %>%
read.table(header = T, row.names = 1, check.names = F, sep = "\t")
head(rownames(raw_counts) %<>% strsplit(":") %>% sapply(getElement, 1))
## [1] "ENSG00000118473" "ENSG00000162426" "ENSG00000157191" "ENSG00000169504"
## [5] "ENSG00000142920" "ENSG00000186094"
raw_counts <- as.matrix(raw_counts)
raw_counts[1:5, 1:5]
## HSJD-GBM002_A10_p58 HSJD-GBM002_A10_p59 GBM002_cA10_p58
## ENSG00000118473 37 30 80
## ENSG00000162426 90 81 174
## ENSG00000157191 475 473 1017
## ENSG00000169504 7542 6570 19732
## ENSG00000142920 55 48 89
## HSJD-GBM002_F06_p55 HSJD-GBM002_F06_p56
## ENSG00000118473 114 62
## ENSG00000162426 80 40
## ENSG00000157191 531 364
## ENSG00000169504 11481 5515
## ENSG00000142920 66 20
rr_saveRDS(object = raw_counts,
desc = "Raw RNA-seq counts for GBM002 cell lines",
file = glue("{out}/cl_raw_counts.Rds"))
## ...writing description of cl_raw_counts.Rds to G34-gliomas/bulk_transcriptome_epigenome/output/04/cl_raw_counts.desc
dim(raw_counts)
## [1] 60234 16
Loading the cell type gene signatures prepared previously for GSEA, and selecting some representative signatures for relevant cell types:
signatures <- readRDS(here(subdir, "output/02/signatures_ens.Rds"))
# Select pathways of interest
poi <- list("newborn_inhib_neuron" = signatures$`HF nIN1`,
"mature_inhib_neuron" = signatures$`HP IN-PV`,
"opc" = signatures$`F-p6 OPC`,
"astrocyte" = signatures$`F-p0 ASTR2`,
"rgc" = signatures$`HF RG-early`,
"fetal_excit_neurons" = signatures$`HF EN-PFC1`)
cl_ssgsea <- ssgsea_le(expr_mat = raw_counts,
gene_sets = poi,
# Following exactly the same parameters as we used
# in the NG bulk analysis
alpha = 0.75,
normalize = FALSE)
cl_ssgsea$enrichment_scores %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "Signature") %>%
rr_write_tsv(path = glue("{out}/cl_ssgsea_scores.tsv"),
desc = "ssGSEA scores for cell type signatures in GBM002 cell lines")
## ...writing description of cl_ssgsea_scores.tsv to G34-gliomas/bulk_transcriptome_epigenome/output/04/cl_ssgsea_scores.desc
leading_edge <- cl_ssgsea$leading_edge
rr_saveRDS(object = leading_edge,
desc = "Leading edge froms ssGSEA analysis of cell type signatures in GBM002 cell lines",
file = glue("{out}/cl_ssgsea_le.Rds"))
## ...writing description of cl_ssgsea_le.Rds to G34-gliomas/bulk_transcriptome_epigenome/output/04/cl_ssgsea_le.desc
cl_ssgsea_tidy <- cl_ssgsea$enrichment_scores %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "Signature") %>%
gather(Sample, Score, 2:ncol(.)) %>%
left_join(cell_line_meta, by = c("Sample" = "Nickname"))
cl_ssgsea_tidy %>%
filter(Signature %in% c("newborn_inhib_neuron", "rgc")) %>%
mutate(Signature = factor(Signature, levels = c("rgc", "newborn_inhib_neuron"))) %>%
filter(Media == "ser") %>%
mutate(Group_broad = factor(Group_broad,
levels = c("G34R_nsm", "KORepair_nsm",
"G34R_ser", "KORepair_ser"))) %>%
rr_ggplot(aes(x = Group_broad, y = Score), plot_num = 1) +
geom_jitter(aes(colour = Group_broad), size = 5, alpha = 0.87, width = 0.1) +
scale_colour_manual(values = c("cyan4", "midnightblue")) +
stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
geom = "crossbar", width = 0.2) +
theme_min(border_colour = "black") +
facet_wrap(~ Signature) +
rotateX() +
noLegend() +
ylim(c(22400, 24000))
[figure/source data @ G34-gliomas/bulk_transcriptome_epigenome/figures/04//viz_ssgsea…]
In a comparison of H3K27me3 at promoters in G34-mutant HGG patient tumor samples vs non-G34 mutant HGGs, G34 tumors have several thousand promoters enriched for K27me3 compared to the other tumor types.
To investigate whether these are an effect of the G34 mutation, or an effect of cell of origin (both of which differ between patient tumors), we’ll look at the expression of these promoters in the CRISPR cell lines, allowing to decouple the effect of G34 (which differs in the isogenic cell line pairs) vs. cell of origin (which does not).
# Read in the tumour promoter K27me3 scores
tumor_k27me3 <- read_xlsx(here(subdir, "data/TABLE-S3_G34-HGG-Epigenome.xlsx"))
# Subset by thresholds used in the paper
# z-score_H3K27me3 > 0.5, pvalue_H3K27me3 < 0.05
tumor_k27me3_filt <- tumor_k27me3 %>%
filter(`z-score_H3K27me3` > 0.5) %>%
distinct(name, .keep_all = TRUE)
# Read in the counts for the cell lines
gbm002_counts <- extract_pipeline_counts(file.path(pipeline_path, "GBM002_ser/counts/Ensembl.ensGene.exon.norm.tsv.gz"),
goi = tumor_k27me3_filt$name) %>%
left_join(cell_line_meta, by = c("sample" = "Nickname"))
## Joining, by = "gene_symbol"
# Check fold changes from RNA-seq DGE in stem at these promoters
dge_stem %>%
filter(gene_symbol %in% tumor_k27me3_filt$name) %>%
ggplot(aes(x = log2FoldChange)) +
geom_density()
# Check what proportion of those genes have associated significant DGE
dge_stem %>%
filter(gene_symbol %in% tumor_k27me3_filt$name) %>%
mutate(significant_upon_editing = case_when(
is.na(padj) ~ "No",
padj < 0.05 & log2FoldChange > 1 ~ "Yes",
TRUE ~ "No")) %>%
count(significant_upon_editing) %>%
ggplot(aes(x = "", y = n)) +
geom_bar(aes(fill = significant_upon_editing), stat = "identity") +
coord_polar("y", start = 0) +
theme_void() +
geom_text(aes(x = c(1, 1), y = c(1000, 0), label = n), size=5)
This document was last rendered on:
## 2020-12-11 13:30:54
The git repository and last commit:
## Local: master /lustre03/project/6004736/sjessa/from_beluga/HGG-G34/G34-gliomas
## Remote: master @ origin (git@github.com:fungenomics/G34-gliomas.git)
## Head: [71d3b7e] 2020-09-28: Ignore slurm submission script
The random seed was set with set.seed(100)
The R session info:
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
##
## locale:
## [1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8
## [5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
## [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices datasets utils methods base
##
## other attached packages:
## [1] pbapply_1.4-3 cowplot_0.9.4 ggrepel_0.8.0 scales_1.1.1 ggplot2_3.1.0
## [6] purrr_0.3.4 glue_1.4.2 magrittr_1.5 dplyr_0.8.0 readxl_1.2.0
## [11] readr_1.3.1 tidyr_0.8.2 here_0.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 git2r_0.27.1 plyr_1.8.6
## [4] pillar_1.4.6 compiler_3.5.1 cellranger_1.1.0
## [7] RColorBrewer_1.1-2 BiocManager_1.30.10 tools_3.5.1
## [10] digest_0.6.25 evaluate_0.14 lifecycle_0.2.0
## [13] tibble_3.0.3 gtable_0.3.0 pkgconfig_2.0.3
## [16] rlang_0.4.7 parallel_3.5.1 yaml_2.2.1
## [19] xfun_0.17 withr_2.2.0 stringr_1.4.0
## [22] knitr_1.29 vctrs_0.3.4 hms_0.5.3
## [25] rprojroot_1.3-2 grid_3.5.1 tidyselect_1.1.0
## [28] R6_2.4.1 rmarkdown_1.11 backports_1.1.9
## [31] ellipsis_0.3.1 htmltools_0.5.0 assertthat_0.2.1
## [34] colorspace_1.4-1 renv_0.10.0 stringi_1.5.3
## [37] lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
A project of the Kleinman Lab at McGill University, using the rr reproducible research template.
home
GitHub