Sam Buckberry 2023-06-23
Analysis of data in context of findings in
Koyanagi-Aoi, M. et al. Differentiation-defective phenotypes revealed by large-scale analyses of human pluripotent stem cells. Proc. Natl. Acad. Sci. U. S. A. 110, 20569–20574 (2013).
Ruiz, S. et al. Identification of a specific reprogramming-associated epigenetic signature in human induced pluripotent stem cells. Proc. Natl. Acad. Sci. U. S. A. 109, 16196–16201 (2012).
as per reviewer request
source("R/project_functions.R")
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: ggplot2
## Loading required package: lattice
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:GenomicRanges':
##
## subtract
##
## Attaching package: 'data.table'
## The following object is masked from 'package:SummarizedExperiment':
##
## shift
## The following object is masked from 'package:GenomicRanges':
##
## shift
## The following object is masked from 'package:IRanges':
##
## shift
## The following objects are masked from 'package:S4Vectors':
##
## first, second
## Loading required package: BSgenome
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## Loading required package: rtracklayer
## Loading required package: AnnotationDbi
##
## Attaching package: 'ggthemes'
## The following object is masked from 'package:cowplot':
##
## theme_map
## Loading required package: Rsamtools
##
## Attaching package: 'VariantAnnotation'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:base':
##
## tabulate
##
## Attaching package: 'ChIPpeakAnno'
## The following object is masked from 'package:VariantAnnotation':
##
## info
##
## Attaching package: 'gtools'
## The following object is masked from 'package:e1071':
##
## permutations
##
## Attaching package: 'UpSetR'
## The following object is masked from 'package:lattice':
##
## histogram
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
## Loading required package: grid
##
## Attaching package: 'grid'
## The following object is masked from 'package:Biostrings':
##
## pattern
gr_te <- readRDS("resources/hg19_rmsk_TE_granges.Rds")
gtfPath <- "resources/genes.gtf.gz"
txdb <- makeTxDbFromGFF(file = gtfPath,
format = "gtf")
## Import genomic features from the file as a GRanges object ...
## OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ... OK
gene_gr <- genes(txdb)
## 643 genes were dropped because they have exons located on both strands
## of the same reference sequence or on more than one reference sequence,
## so cannot be represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a
## GRangesList object, or use suppressMessages() to suppress this message.
gene_sub <- gene_gr[gene_gr$gene_id %in% c("HHLA1", "ABHD12B", "C4orf51")]
te_sub <- gr_te[overlapsAny(gr_te, gene_sub)]
te_sub <- te_sub[te_sub$gene == "LTR7"]
mdat <- read.csv("wgbs/metadata/wgbs_metadata_local.csv")
mdat <- mdat[mdat$Group %in% c("Primed-hiPSC", "TNT-hiPSC", "NtP-hiPSC", "hESC"), ]
mdat <- mdat[mdat$Media != "t2iLGoY", ]
mdat <- mdat[mdat$Progenitor %in% c("ESC", "Fibroblast"), ]
mdat <- mdat[mdat$Lab == "Lister", ]
mdat <- mdat[mdat$Batch != "C", ]
mdat <- mdat[!grepl("merge", mdat$Library_id), ]
file.exists(mdat$BSseq_CG)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE
te_mCG <- make_mC_matrix(obj_fls = mdat$BSseq_CG, gr = te_sub, cores = 3)
## Making matrix of mC levels for regions...
colnames(te_mCG) <- mdat$Library_id
te_mCG <- data.frame(te_mCG)
te_mCG$gene <- c("ABHD12B", "ABHD12B", "C4orf51", "C4orf51", "HHLA1", "HHLA1")
te_mCG_melt <- reshape2::melt(te_mCG)
## Using gene as id variables
ind <- match(te_mCG_melt$variable, mdat$Library_id)
te_mCG_melt$group <- mdat$Group[ind]
te_mCG_melt$lab <- mdat$Lab[ind]
te_mCG_melt$background <- mdat$Background[ind]
te_mCG_melt <- te_mCG_melt[te_mCG_melt$lab == "Lister" &
(te_mCG_melt$group %in% c("Primed-hiPSC", "TNT-hiPSC", "NtP-hiPSC", "hESC")), ]
te_mCG_melt$group <- factor(te_mCG_melt$group,
levels=c("Primed-hiPSC", "TNT-hiPSC", "NtP-hiPSC", "hESC"))
te_mCG_melt$gene <- factor(te_mCG_melt$gene, levels=c("HHLA1", "ABHD12B", "C4orf51"))
pdf("wgbs/plots/Koyanagi_genes_te_mCG_boxplots.pdf", width = 3, height = 2)
ggplot(te_mCG_melt, aes(x = group, y = value, group=group)) +
geom_boxplot() + ylab("LTR7 mCG/CG") +
facet_grid(.~gene, scales = "free", space = "free", drop = TRUE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6,
colour = 'black'),
axis.text.y = element_text(size=6, colour='black', angle = 0),
strip.text.y = element_text(size = 6),
text = element_text(size=6),
strip.background = element_blank(),
axis.line.x = element_line(color = 'black', size = line_mm),
axis.line.y = element_line(color = 'black', size = line_mm),
axis.ticks = element_line(color = 'black', size = line_mm))
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
dev.off()
## quartz_off_screen
## 2
wb_ed_fig9c <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb_ed_fig9c, sheetName = "ED_Fig_9c")
openxlsx::writeData(wb = wb_ed_fig9c, sheet = "ED_Fig_9c",
x = te_mCG_melt)
openxlsx::saveWorkbook(wb = wb_ed_fig9c,
file = "ED_Figure_9c_source_data.xlsx", overwrite = TRUE)
ruiz_genes <- c("PTPRT", "TMEM132C", "TMEM132D", "TCERG1L",
"DPP6", "FAM19A5", "RBFOX1", "CSMD1", "C22orf34")
ruiz_gene_gr <- gene_gr[gene_gr$gene_id %in% ruiz_genes]
## Authors add 10kb to gene start and end
ruiz_gene_gr_expand <- ruiz_gene_gr + 10000
## Get mCG
ruiz_mCG <- make_mC_matrix(obj_fls = mdat$BSseq_CG,
gr = ruiz_gene_gr_expand, cores = 4)
## Making matrix of mC levels for regions...
colnames(ruiz_mCG) <- mdat$Library_id
ruiz_mCG <- data.frame(ruiz_mCG)
ruiz_mCG$gene <- as.character(ruiz_gene_gr$gene_id)
ruiz_mCG_melt <- reshape2::melt(ruiz_mCG)
## Using gene as id variables
ind2 <- match(ruiz_mCG_melt$variable, mdat$Library_id)
ruiz_mCG_melt$group <- mdat$Group[ind2]
ruiz_mCG_melt$lab <- mdat$Lab[ind2]
ruiz_mCG_melt$background <- mdat$Background[ind2]
ruiz_mCG_melt <- ruiz_mCG_melt[ruiz_mCG_melt$lab == "Lister" &
(ruiz_mCG_melt$group %in% c("Primed-hiPSC",
"TNT-hiPSC", "NtP-hiPSC", "hESC")), ]
ruiz_mCG_melt$group <- factor(ruiz_mCG_melt$group,
levels=c("Primed-hiPSC", "TNT-hiPSC",
"NtP-hiPSC", "hESC"))
pdf("wgbs/plots/ruiz_gene_methylation_boxplots.pdf", width = 7, height = 2.5)
ggplot(ruiz_mCG_melt, aes(x = group, y = value, group=group, )) +
geom_boxplot() + ylab("LTR7 mCG/CG") +
facet_grid(.~gene, scales = "free", space = "free", drop = TRUE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6,
colour = 'black'),
axis.text.y = element_text(size=6, colour='black', angle = 0),
strip.text.y = element_text(size = 6),
text = element_text(size=6),
strip.background = element_blank(),
axis.line.x = element_line(color = 'black', size = line_mm),
axis.line.y = element_line(color = 'black', size = line_mm),
axis.ticks = element_line(color = 'black', size = line_mm))
dev.off()
## quartz_off_screen
## 2
wb_ed_fig9e <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb_ed_fig9e, sheetName = "ED_Fig_9e")
openxlsx::writeData(wb = wb_ed_fig9e, sheet = "ED_Fig_9e",
x = ruiz_mCG_melt)
openxlsx::saveWorkbook(wb = wb_ed_fig9e,
file = "ED_Figure_9e_source_data.xlsx", overwrite = TRUE)