tnt

Choi et al. (2015) Nat. Biotech reanalysis: transposable elements

Sam Buckberry 29/06/2020

source("R/project_functions.R")
## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: ggplot2

## Loading required package: lattice

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## The following object is masked from 'package:Biobase':
## 
##     rowMedians

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:GenomicRanges':
## 
##     subtract

## 
## Attaching package: 'data.table'

## The following object is masked from 'package:SummarizedExperiment':
## 
##     shift

## The following object is masked from 'package:GenomicRanges':
## 
##     shift

## The following object is masked from 'package:IRanges':
## 
##     shift

## The following objects are masked from 'package:S4Vectors':
## 
##     first, second

## Loading required package: BSgenome

## Loading required package: Biostrings

## Loading required package: XVector

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

## Loading required package: rtracklayer

## Loading required package: AnnotationDbi

## 
## Attaching package: 'ggthemes'

## The following object is masked from 'package:cowplot':
## 
##     theme_map

## Loading required package: Rsamtools

## 
## Attaching package: 'VariantAnnotation'

## The following object is masked from 'package:stringr':
## 
##     fixed

## The following object is masked from 'package:base':
## 
##     tabulate

## 
## Attaching package: 'ChIPpeakAnno'

## The following object is masked from 'package:VariantAnnotation':
## 
##     info

## 
## Attaching package: 'gtools'

## The following object is masked from 'package:e1071':
## 
##     permutations

## 
## Attaching package: 'UpSetR'

## The following object is masked from 'package:lattice':
## 
##     histogram

## Loading required package: limma

## 
## Attaching package: 'limma'

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

## Loading required package: grid

## 
## Attaching package: 'grid'

## The following object is masked from 'package:Biostrings':
## 
##     pattern

TE quantification (not run)

parallel -j 35 ~/working_data_01/bin/TElocal/TElocal -b {} \
--GTF /home/sbuckberry/working_data_02/polo_project/human_ips/resources/hg19_ensGene_with_ercc.gtf \
--TE /home/sbuckberry/working_data_02/polo_project/human_ips/resources/hg19_rmsk_TElocus.ind \
--project {.} \
--format BAM \
--sortByPos \
--stranded reverse \
--mode uniq ::: SRS*uniq.bam

Read the sample count tables and combine into matrix

# # List the counts tables for all samples
# cnt_files <- list.files(path = "RNAseq/hochedlinger_data/te_analysis/", full.names = TRUE, pattern = "cntTable.gz")
# 
# dat2 <- lapply(cnt_files, read.table, header = TRUE, row.names = 1) %>% do.call(cbind, .)
# libs <- colnames(dat2) %>% str_sub(start = 1, end = 10)
# colnames(dat2) <- libs
# 
# #remove rows with all zero's 
# keep_row <- rowSums(dat2) > 0
# table(keep_row)
# dat2 <- dat2[keep_row, ]
# saveRDS(dat2, file = "RNAseq/hochedlinger_data/all_te_and_gene_counts.Rds")

dat2 <- readRDS("RNAseq/hochedlinger_data/all_te_and_gene_counts.Rds")
dim(dat2)
## [1] 2048776      63

Read the meta-data table and match to expression data

sample_dat <- fread("RNAseq/hochedlinger_data/sample_sheet.csv")
sample_dat <- sample_dat[match(colnames(dat2), sample_dat$secondary_sample_accession), ]
all(colnames(dat2) == sample_dat$Run)
## [1] TRUE

Read in the count data

y2 <- DGEList(counts = dat2)
y2$samples <- cbind(y2$samples, sample_dat)

Load the transposable element data

## Get the TE genome co-ordinates
 repeat_gtf <- read.table("resources/hg19_rmsk_TE.gtf.gz")
 repeat_gr <- GRanges(seqnames = repeat_gtf$V1,
                      ranges = IRanges(start = repeat_gtf$V4,
                                       end = repeat_gtf$V5))
 strand(repeat_gr) <- repeat_gtf$V7
 repeat_gr$class <- repeat_gtf$V19
 repeat_gr$family <- repeat_gtf$V16
 repeat_gr$gene <- repeat_gtf$V10
 repeat_gr$transcript <- repeat_gtf$V13
 repeat_gr$id <- str_c(repeat_gr$transcript, repeat_gr$family, repeat_gr$class, sep = ":")
 rm(repeat_gtf)
 saveRDS(repeat_gr, "resources/hg19_rmsk_TE_granges.Rds")

repeat_gr <- readRDS("resources/hg19_rmsk_TE_granges.Rds")
repeat_gr <- readRDS("resources/hg19_rmsk_TE_granges.Rds")

Function to test DE for each genetic background

test_te_de <- function(background="HUES3", groups=c("iPSC", "ESC_GFP"), dataset="data set: isogenic"){
    
    y_sub <- y2[ ,(y2$samples$Background %in% background) &
                     (y2$samples$Group %in% groups) &
                     (y2$samples$characteristics_ch1 == dataset)]
    dim(y_sub)
    design <- model.matrix(~Group, data=y_sub$samples)
    colnames(design) <- str_remove(string = colnames(design), pattern = "Group")
    
    keep <- filterByExpr(y_sub, design)
    table(keep)
    y_sub <- y_sub[keep, ,keep.lib.sizes=FALSE]
    y_sub <- calcNormFactors(y_sub)

    y_sub <- estimateDisp(y_sub, design = design, robust = TRUE)
    
    plotBCV(y_sub)
    
    plotMDS(y_sub[grepl(pattern = "_dup", rownames(y_sub)), ], labels = y_sub$samples$source_name_ch1)
    
    fit <- glmFit(y_sub, design)
    
    lrt <- glmLRT(fit, contrast=c(0,-1))
    tt <- topTags(lrt, n = nrow(y_sub))
    tt_table <- tt$table
    tt_table$gene_id <- rownames(tt_table)
    tt_table$contrast <- background
        
    # Subset for TE's and recalculate FDR
    tt_table <- tt_table[tt_table$gene_id %in% repeat_gr$id, ]
    tt_table$FDR <- p.adjust(p = tt_table$PValue, method = "fdr")
    
    
    tt_table$significant <- (abs(tt_table$logFC) > 1) & (tt_table$FDR < 0.05) & (tt_table$logCPM > 0)
    tt_table$significant <- ifelse(test = tt_table$significant, yes = "Significant", no = "NS")
    tt_table$significant <- factor(tt_table$significant, levels = c("NS", "Significant"))
    
    # Add the TE locus information
    ind <- match(tt_table$gene_id, repeat_gr$id)

    tt_table <- cbind(tt_table, as.data.frame(repeat_gr)[ind, ])
}

hues3_tt <- test_te_de(background = "HUES3")

hues2_tt <- test_te_de(background = "HUES2")

hues2_GFP_tt <- test_te_de(background = "HUES2", groups = c("ESC", "ESC_GFP"))

all_tt <- rbind(hues2_tt, hues3_tt)
te_de_df <- all_tt
sig_ids <- te_de_df$gene_id[te_de_df$significant == "Significant"] %>% unique()

background <- c("HUES2", "HUES3")
groups <- c("iPSC", "ESC_GFP") 
dataset <- "data set: isogenic"

y_sub <- y2[ ,(y2$samples$Background %in% background) &
                     (y2$samples$Group %in% groups) &
                     (y2$samples$characteristics_ch1 == dataset)]

all_cpm <- log2(cpm(y_sub) + 1)

hm_annot_dat <- y_sub$samples[ ,c("Background", "Group")] 

col_ids <- str_c(y_sub$samples$title)
        
plot_dat <- all_cpm[rownames(all_cpm) %in% sig_ids, ]
#plot_dat <- all_cpm[grepl("HERVH-int", rownames(all_cpm)),
#                    y_sub$samples$source_name_ch1 != "Fibroblast"]
plot_dat <- plot_dat[complete.cases(plot_dat), ] %>% data.frame()

pdf("RNAseq/plots/Choi_TE_heatmap_combined.pdf", width = 4, height = 5)
pheatmap(plot_dat, scale = 'row',
         annotation_col = hm_annot_dat, 
                 labels_col = col_ids, main = "title",
                 annotation_names_col = TRUE,
                 show_rownames = FALSE, border_color = NA,
                 clustering_distance_rows = 'correlation',
                 clustering_distance_cols = 'correlation')
dev.off()
## pdf 
##   3
pheatmap(plot_dat, scale = 'row',
         annotation_col = hm_annot_dat, 
                 labels_col = col_ids, main = "title",
                 annotation_names_col = TRUE,
                 show_rownames = FALSE, border_color = NA,
                 clustering_distance_rows = 'correlation',
                 clustering_distance_cols = 'correlation')

wb_ed_fig8k <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb_ed_fig8k, sheetName = "ED_Fig_8k")
openxlsx::writeData(wb = wb_ed_fig8k, sheet = "ED_Fig_8k",
                    x = plot_dat)
openxlsx::saveWorkbook(wb = wb_ed_fig8k,
                       file = "ED_Figure_8k_source_data.xlsx", overwrite = TRUE)