Purpose:

Check Intron Expression in HEK cells

1. Load packages:

library(tidyverse)
library(cowplot)
library(ggsci)
library(ggbeeswarm)
library(ComplexUpset)
library(Gviz)

2. Load functions:

### all necessary custom functions are in the following script
source(paste0(here::here(),"/0_Scripts/custom_functions.R"))

theme_pub <- theme_bw() + theme(
                                     plot.title = element_text(hjust = 0.5, size=18, face="bold"),
                                     axis.text = element_text(colour="black", size=14), 
                                     axis.title=element_text(size=16,face="bold"), 
                                     legend.text=element_text(size=14),
                                     legend.position="right",
                                     axis.line.x = element_line(colour = "black"), 
                                     axis.line.y = element_line(colour = "black"),
                                     strip.background=element_blank(), 
                                     strip.text=element_text(size=16))  

theme_set(theme_pub)

#prevent scientific notation
options(scipen=999)

fig_path <- paste0(here::here(),"/3_RNA_isolation/intron_exon_analysis/")
data_path<-paste0(here::here(),"/3_RNA_isolation/")

## Feature Colours

feat_cols<-c("#F08C4B", "#E5DFCC", "#556f44", "#9dc183", "#4D8C57", "#3A8DB5", "#256EA0","dodgerblue4","#256EA0")

names(feat_cols)<-c("Ambiguity","Intergenic","Ribosomal","Mitochondrial","lncRNA","Intron","Coding","Intragenic","Exon")

inex_colors <- c("#ff5154","#91a6ff","#550527","#BC7238")
names(inex_colors)<-c("Intron","Exon","Both","Intron only")

3. Load annotation:

## annotation
gtype_human <-getbiotype("hsapiens_gene_ensembl",species="human")

ensembl <- useMart("ensembl", dataset ="hsapiens_gene_ensembl", host="uswest.ensembl.org") 

biotype <- getBM(attributes=c("ensembl_gene_id","ensembl_gene_id_version","gene_biotype","external_gene_name","chromosome_name"),
                 mart=ensembl )

4. Load data:

inf <-
  read.csv(
    paste0(data_path, "/sample_info.csv"),
    header = T,
    stringsAsFactors = F
  ) %>%
  mutate(Condition = case_when(
    Condition == "Incubation + ProtK" ~ "Magnetic Beads",
    TRUE ~ Condition
  )) %>%
  filter(Celltype == "HEK")

inf$Sample <- as.character(inf$Sample)


counts_hek <-
  readRDS(
    paste0(
      data_path,
      "Bulk_opt_lysis_test_2_HEK.dgecounts.rds"
    )
  )

# Split into exon, intron and Exon+ intron matrices.
# Filter samples and remove Geneversion numbers

counts_hek_ex <-
  counts_hek$umicount$exon$all %>% as.matrix() %>% remove_Geneversion()
counts_hek_ex <- counts_hek_ex[, inf$BC]
counts_hek_inex <-
  counts_hek$umicount$inex$all %>% as.matrix() %>% remove_Geneversion()
counts_hek_inex <- counts_hek_inex[, inf$BC]
counts_hek_in <-
  counts_hek$umicount$intron$all  %>% as.matrix() %>% remove_Geneversion()
counts_hek_in <- counts_hek_in[, inf$BC]

## make exon only / intron only matrix, remove genes detected in both
counts_hek_ex_o <-
  counts_hek_ex[!(rownames(counts_hek_ex) %in% rownames(counts_hek_in)), ]
counts_hek_in_o <-
  counts_hek_in[!(rownames(counts_hek_in) %in% rownames(counts_hek_ex)), ]
counts_hek_inex_o <-
  counts_hek_inex[!(rownames(counts_hek_inex) %in% rownames(counts_hek_ex_o)), ]
counts_hek_inex_o <-
  counts_hek_inex_o[!(rownames(counts_hek_inex_o) %in% rownames(counts_hek_in_o)), ]

counts_hek_in_o <- counts_hek_in_o[rowSums(counts_hek_in_o) > 0, ]
counts_hek_ex_o <- counts_hek_ex_o[rowSums(counts_hek_ex_o) > 0, ]
counts_hek_inex_o <- counts_hek_inex_o[rowSums(counts_hek_inex_o) > 0, ]

cond <- c("Magnetic Beads", "Column")


for (i in cond) {
  
  sub_inf <- inf %>%
    filter(Condition %in% i)
  
  intron_only_genes <-
    rownames(counts_hek_in_o[whichgenes_reproducible(counts_hek_in_o[,sub_inf$BC],
                                                     exprcutoff = 5,
                                                     reproducecutoff = 0.1), sub_inf$BC])
  
  exon_only_genes <-
    rownames(counts_hek_ex_o[whichgenes_reproducible(counts_hek_ex_o[,sub_inf$BC],
                                                     exprcutoff = 5,
                                                     reproducecutoff = 0.1), sub_inf$BC])
  
  inex_genes <-
    rownames(counts_hek_inex_o[whichgenes_reproducible(counts_hek_inex_o[,sub_inf$BC],
                                                       exprcutoff = 5,
                                                       reproducecutoff = 0.1), sub_inf$BC])
  
  inex_genes <-
    inex_genes[order(rowSums(counts_hek_inex_o[whichgenes_reproducible(counts_hek_inex_o[,sub_inf$BC],
                                                                       exprcutoff = 5,
                                                                       reproducecutoff = 0.1),
                                               sub_inf$BC]),
                     decreasing = T)] # sort by highest overall count
  
  gene_type <-
    data.frame(
      ENSEMBL = c(exon_only_genes, intron_only_genes, inex_genes),
      detection = c(
        rep("exon", times = length(exon_only_genes)),
        rep("intron", times = length(intron_only_genes)),
        rep("both", times = length(inex_genes))
      )
    ) %>%
    left_join(biotype, by = c("ENSEMBL" = "ensembl_gene_id")) %>%
    mutate(
      biotype2 = case_when(
        grepl(gene_biotype, pattern = "*pseudogene") ~ "pseudogene",
        grepl(gene_biotype, pattern = "IG*") ~ "protein coding",
        grepl(gene_biotype, pattern = "TR*") ~ "protein coding",
        grepl(gene_biotype, pattern = "MT*") ~ "other",
        grepl(gene_biotype, pattern = "^s") ~ "small RNA",
        grepl(gene_biotype, pattern = "ribozyme") ~ "other",
        gene_biotype == "misc_RNA" ~ "other",
        gene_biotype == "protein_coding" ~ "protein coding",
        T ~ gene_biotype
      ),
      cond = i
    ) %>%
    filter(!(is.na(biotype2)))
  
  gene_type$biotype2 <-
    factor(gene_type$biotype2, levels = rev(
      c(
        "protein coding",
        "lncRNA",
        "pseudogene",
        "rRNA",
        "small RNA",
        "miRNA",
        "not_annotated",
        "other"
      )
    ))
  
  if (!(exists("gene_type_sum"))) {
    gene_type_sum <- gene_type
    
  } else{
    gene_type_sum <- bind_rows(gene_type_sum, gene_type)
  }
  
}

5. Plot gene biotypes per group

ggplot() +
  geom_bar(data = gene_type_sum,
           aes(x = cond, fill = biotype2),
           #position = position_fill(),
           na.rm = T) +
  ggsci::scale_fill_npg() +
  theme(legend.position = "right") +
  labs(fill = "",
       x = "",
       y = "detected Genes") +
  facet_grid(~detection)+
  theme(axis.text.x = element_text(angle=45,hjust=1))

No difference in composition based on extraction method. We continue with the Bead isolation Condition from here on.

inf<-inf %>% 
  filter(Condition=="Magnetic Beads")

gene_type<-gene_type_sum %>% 
  filter(cond=="Magnetic Beads")

counts_hek_inex<-counts_hek_inex[,inf$BC]

counts_hek_ex_o<-counts_hek_inex[subset(gene_type,detection=="exon")$ENSEMBL,]

counts_hek_in_o<-counts_hek_inex[subset(gene_type,detection=="intron")$ENSEMBL,]

counts_hek_inex_o<-counts_hek_inex[subset(gene_type,!(detection%in%c("intron","exon")))$ENSEMBL,]

##

counts_hek_in <- counts_hek_in[whichgenes_reproducible(counts_hek_in[,inf$BC],
                                                     exprcutoff = 2,
                                                     reproducecutoff = 0.1), inf$BC]
counts_hek_ex <- counts_hek_ex[whichgenes_reproducible(counts_hek_ex[,inf$BC],
                                                     exprcutoff = 2,
                                                     reproducecutoff = 0.1), inf$BC]

6. upset plot of intersections

average_counts<-data.frame(ENSEMBL=rownames(counts_hek_inex),
                           mean_cnt=rowMeans(counts_hek_inex)
                           )

intron_genes<-rownames(counts_hek_in) %>% 
  unique()

exon_genes<-rownames(counts_hek_ex)%>% 
  unique()

upset_df<-data.frame(ENSEMBL=c(exon_genes,intron_genes),
                      detection=c(rep("exon",
                                      times=length(exon_genes)),
                             rep("intron",
                                 times=length(intron_genes))
                             )) %>% 
  left_join(biotype,by=c("ENSEMBL"="ensembl_gene_id")) %>% 
  mutate(biotype2=case_when(grepl(gene_biotype,pattern="*pseudogene")~ "pseudogene",
                            grepl(gene_biotype,pattern="IG*")~ "protein coding",
                            grepl(gene_biotype,pattern="TR*")~ "protein coding",
                            grepl(gene_biotype,pattern="MT*")~ "other",
                            grepl(gene_biotype,pattern="^s")~ "small RNA",
                            grepl(gene_biotype,pattern="ribozyme")~ "other",
                           gene_biotype=="misc_RNA"~ "other",
                           gene_biotype=="protein_coding"~ "protein coding",
                           T~gene_biotype)) %>% 
  filter(!is.na(biotype2)) %>% 
  inner_join(average_counts)

upset_df$biotype2<-factor(upset_df$biotype2,levels=rev(c("protein coding","lncRNA","pseudogene","rRNA","small RNA","miRNA","other")))


upset_df<-upset_df %>% 
  mutate(value=TRUE) %>% 
  pivot_wider(names_from = detection,values_from = value,values_fill = FALSE) %>% 
  left_join(gene_type)

categories<-c("exon","intron")
 
upset<-upset(upset_df,
      categories,
      name="Gene detection",
      base_annotations=list(
        'Intersection size'=intersection_size(
            counts=FALSE,
            mapping=aes(fill=biotype2)
            
        )+scale_fill_npg()
        +labs(fill="")
      ),
      annotations = list(
        'nUMI'=(
            ggplot(mapping=aes(y=mean_cnt,fill=biotype2))
            +geom_boxplot(position="dodge",outlier.shape = NA,show.legend = F)
            +scale_y_log10()
            +scale_fill_npg()
      )))

upset

ggsave(upset,
       device = "pdf",
       path = fig_path,
       width = 150,
       height=180,
       units = "mm",
       filename = "upset_plot.pdf"
       )

7. Gene and UMI counts for the different conditions

make_summary<-function(mat,type){
  data.frame(BC=colnames(mat),
             umi=colSums(mat),
             gene=colSums(mat>0),
             type=type)
}



summary<-rbind(make_summary(counts_hek_ex_o,"Exon"),
      make_summary(counts_hek_in_o,"Intron"),
      make_summary(counts_hek_inex_o,"Both"))

summary$exclusive<-"yes"

ggplot(summary,aes(y=umi,x=type,col=type))+
  geom_boxplot()+
  geom_beeswarm()+
  scale_y_log10()+
  scale_colour_manual(values=inex_colors,limits=force)

ggplot(summary,aes(y=gene,x=type,col=type))+
  geom_boxplot()+
  geom_beeswarm()+
  scale_y_log10()+
  scale_colour_manual(values=inex_colors,limits=force)

summary2<-rbind(make_summary(counts_hek_ex,"Exon"),
      make_summary(counts_hek_in,"Intron"),
      make_summary(counts_hek_inex,"Both"))

summary2$exclusive<-"no"

ggplot(summary2,aes(y=umi,x=type,col=type))+
  geom_boxplot()+
  geom_beeswarm()+
  scale_y_log10()+
  ggtitle("")+
  scale_colour_manual(values=inex_colors,limits=force)

ggplot(summary2,aes(y=gene,x=type,col=type))+
  geom_boxplot()+
  geom_beeswarm()+
  scale_y_log10()+
  scale_colour_manual(values=inex_colors,limits=force)

summary3<-rbind(summary,summary2) #%>% 
 # pivot_longer(cols=c(umi,gene),names_to = "type2" ,values_to = "count") 


ggplot(summary3,aes(type,y=gene,group=paste0(exclusive,type),col=exclusive))+
  geom_boxplot()+
  geom_beeswarm(dodge.width = 0.80)+
  scale_y_log10()

ggplot(summary3,aes(type,y=umi,group=paste0(exclusive,type),col=exclusive))+
  geom_boxplot()+
  geom_beeswarm(dodge.width = 0.80)+
  scale_y_log10()

8. Exon vs. Intron expression correlation

inex_genes<-rownames(counts_hek_inex)

# subset for genes expressed in both
exon<-counts_hek_ex[rownames(counts_hek_ex)%in%inex_genes,] 
exon<-exon[whichgenes_reproducible(exon,5,reproducecutoff = 0.2),] 
exon<-exon[,] 

intron<-counts_hek_in[rownames(counts_hek_in)%in%inex_genes,]
intron<-intron[whichgenes_reproducible(intron,5,reproducecutoff = 0.2),] 

inex_lib<-data.frame(sample=names(colSums(counts_hek_inex)),
           LibSize=colSums(counts_hek_inex))

## normalize to UMI per million

upm<-function(umi_counts){ 
  umi_counts%>%
  as.data.frame() %>%
  rownames_to_column(var="GeneID") %>%
  pivot_longer( cols = 2:ncol(.),
                names_to = "sample",
                values_to = "cnt") %>%
  left_join(inex_lib) %>% 
  group_by(sample) %>%
  mutate(upm = cnt / LibSize *1e6 ) %>%
  dplyr::select(-c(LibSize,cnt))%>%
  pivot_wider(names_from = sample,
              values_from = upm) %>%
  column_to_rownames(var = "GeneID") %>%
  as.matrix()
}

exon_upm<-upm(exon)
exon_upm<-log2(exon_upm+1)
head(exon_upm)
#>                   AAAACT   ATATAG   GTTTAT   TGTTTA   GCTAGA    CCCACG   CGGTGG
#> ENSG00000000003 6.777043 6.894788 6.881346 6.944080 6.935219 7.1598135 6.991783
#> ENSG00000000419 5.572084 5.541948 5.722610 5.331175 4.769669 5.2074211 5.363794
#> ENSG00000000457 2.626398 1.880397 3.364346 2.689834 2.859309 1.8860013 2.270964
#> ENSG00000000460 3.928917 4.865845 4.638661 4.511567 4.896307 4.2459487 4.331438
#> ENSG00000000971 0.000000 0.000000 0.000000 0.000000 0.000000 0.9250104 1.542618
#> ENSG00000001036 6.173004 6.197858 6.236245 6.332796 6.385227 6.2907196 5.914327
#>                   GCTCGC   AAAGTT   ATCAAA   TAAAGT   TTAATC   GGGATT   CCCCGT
#> ENSG00000000003 7.197138 6.375255 6.389947 6.430394 6.690962 6.469344 6.334377
#> ENSG00000000419 5.931007 4.515383 4.605971 4.806156 4.524692 4.328676 4.681841
#> ENSG00000000457 3.792812 2.097778 1.704817 2.677023 3.132752 1.086215 1.943430
#> ENSG00000000460 4.886397 3.285104 3.706273 3.267919 3.802377 3.147655 3.928934
#> ENSG00000000971 0.000000 1.065916 0.000000 1.055938 1.198606 0.000000 0.000000
#> ENSG00000001036 6.120693 5.680913 5.598644 5.599290 5.722660 5.836887 5.879522
#>                   CGTGGG   GGAGCC
#> ENSG00000000003 6.876656 6.983402
#> ENSG00000000419 4.684511 4.822240
#> ENSG00000000457 2.179717 3.335826
#> ENSG00000000460 4.026753 3.959295
#> ENSG00000000971 0.000000 0.000000
#> ENSG00000001036 6.340348 5.888000

intron_upm<-upm(intron)
intron_upm<-log2(intron_upm+1)
head(intron_upm)
#>                   AAAACT   ATATAG   GTTTAT   TGTTTA   GCTAGA    CCCACG   CGGTGG
#> ENSG00000000003 0.000000 1.227052 1.870535 0.000000 1.170781 0.9250104 0.000000
#> ENSG00000000419 2.900825 3.707959 3.695444 4.023849 3.088823 3.9429054 3.641473
#> ENSG00000000457 2.900825 0.000000 1.870535 2.422748 2.586247 2.4577228 2.752681
#> ENSG00000000460 4.256398 4.651676 4.475957 4.205365 4.393785 4.8509811 4.582480
#> ENSG00000001036 1.842944 0.000000 0.000000 1.063833 1.170781 0.0000000 0.000000
#> ENSG00000001084 4.350893 4.095129 4.475957 4.820789 4.393785 4.8053408 4.892314
#>                   GCTCGC   AAAGTT   ATCAAA   TAAAGT   TTAATC   GGGATT   CCCCGT
#> ENSG00000000003 0.000000 2.425995 0.810066 1.659075 1.198606 0.000000 0.962534
#> ENSG00000000419 3.258980 2.693206 2.649165 2.677023 3.802377 3.473587 4.016111
#> ENSG00000000457 2.747467 1.672198 2.464603 3.096513 0.000000 2.127464 2.521967
#> ENSG00000000460 4.181589 4.444698 3.936642 3.685914 4.603667 4.481563 3.836148
#> ENSG00000001036 1.280269 1.065916 0.810066 1.055938 1.198606 2.127464 0.000000
#> ENSG00000001084 3.635841 3.577125 3.529332 4.426421 4.048073 4.481563 3.736982
#>                   CGTGGG   GGAGCC
#> ENSG00000000003 1.122263 0.000000
#> ENSG00000000419 3.674553 2.691105
#> ENSG00000000457 2.783333 2.213726
#> ENSG00000000460 4.026753 4.393207
#> ENSG00000001036 0.000000 1.495367
#> ENSG00000001084 4.309621 3.574847

df<-data.frame(ENSEMBL=rownames(exon_upm),exon=rowMeans(exon_upm)) %>% 
  inner_join(data.frame(ENSEMBL=rownames(intron_upm),intron=rowMeans(intron_upm))) %>% 
  inner_join(gene_type) %>% 
  filter(biotype2%in%c("protein coding")) %>% 
  unique()
  

smc_inex<-ggplot(df) + aes(x=exon, y=intron)+
  stat_density2d(geom="tile", aes(fill=..density..^0.25, alpha=1), contour=FALSE,show.legend = F) +
  geom_point(size=0.5,show.legend = F) +
  stat_density2d(geom="tile", aes(fill=..density..^0.25,     alpha=ifelse(..density..^0.25<0.15,0,1)), contour=FALSE,show.legend = F) +
  scale_fill_gradientn(colours = colorRampPalette(c("white", blues9,"black"))(256))+
  theme(panel.grid = element_blank())+
  geom_smooth(aes(exon,intron),method="lm",col="grey70")+
  labs(x="Log Mean expression Exon",y="Log Mean expression Intron")

smc_inex

hist.inex<-df %>%
  dplyr::rename(Exon=exon,
         Intron=intron) %>%
  pivot_longer(cols = c("Exon","Intron")) %>%
  ggplot()+
  geom_histogram(aes(value,fill=name),alpha=0.8,colour="grey50",bins = 50)+
  scale_fill_manual(values = inex_colors,limits=force)+
  labs(x="Log Mean Expression",
       fill="")+
  theme(legend.position = c(0.8,0.5),
        legend.background = element_blank())


hist.inex

ggsave(smc_inex,
       device = "pdf",
       path = fig_path,
       width = 100,
       height=100,
       units = "mm",
       filename = "scatter_plot.pdf"
       )

ggsave(hist.inex,
       device = "pdf",
       path = fig_path,
       width = 100,
       height=100,
       units = "mm",
       filename = "exp_dist_plot.pdf"
       )

9. ranked exon vs. intron

rank_mat<-function(count_mat){ 
  count_mat%>%
  as.data.frame() %>%
  rownames_to_column(var="GeneID") %>%
  pivot_longer( cols = 2:ncol(.),
                names_to = "sample",
                values_to = "cnt") %>%
  arrange(GeneID,cnt) %>% 
  group_by(sample) %>%
  mutate(gene_rank = rank(cnt)) %>%
    dplyr::select(-cnt) %>% 
  pivot_wider(names_from = sample,
              values_from = gene_rank) %>% 
  ungroup() %>% 
  column_to_rownames(var = "GeneID") %>%
  as.matrix()
    
}

exon_rank<-rank_mat(exon)

intron_rank<-rank_mat(intron)

df_rank<-data.frame(ENSEMBL=rownames(exon_rank),
                    exon_rank=rowMeans(exon_rank)) %>% 
  inner_join(data.frame(ENSEMBL=rownames(intron_rank),
                        intron_rank=rowMeans(intron_rank))) %>% 
  inner_join(df) %>% 
  filter(biotype2%in%c("protein coding","lncRNA")) %>% 
  mutate(norm_rank_exon=exon_rank/max(exon_rank),
         norm_rank_intron=intron_rank/max(intron_rank))
  
  ggplot(df_rank)+
  geom_density2d_filled(aes(exon_rank,intron_rank),)+
  geom_smooth(aes(exon_rank,intron_rank),method="lm")+
  facet_wrap(~biotype2,nrow=2,scales="free")

10. Genbody coverage

1: subset .bam file for barcodes 2: run get3distance_v3_SP_v2.R for intron and for exon separately 3: combine outputs and plot

bam_path<- "/data/share/htp/prime-seq_Paper/Fig_beads_columns/zUMIs/HEK/zUMIs_output/demultiplexed/"

bam_files<-list.files(path = bam_path,pattern = ".bam$")

bcs<-inf$XC

bam_files<-bam_files[str_split(bam_files,pattern = "[.]",simplify = T)[,2] %in% bcs]

get per bam file geneBody coverage

# library(GenomicRanges, quietly=T, warn.conflicts = F)
# library(GenomicFeatures, quietly=T, warn.conflicts = F)
# library(GenomicAlignments, quietly=T, warn.conflicts = F)
# 
# ## Inputs:
# #bam_files #with corresponding .bai
# #out #"name base for plot and outtable"
# counts_hek_inex<-counts_hek$umicount$inex$all %>% as.matrix()
# counts_hek_inex<-counts_hek_inex[,inf$BC]
# 
# inex_genes<-rownames(counts_hek_inex)
# 
# inex_genes<-inex_genes[order(rowSums(counts_hek_inex),decreasing = T)] # sort by highest overall count
# 
# genes <- inex_genes[1:2000] #if you want to subset genes, list of ENSG ids
# write.table(genes,paste0(bam_path,"inex.genes.txt"),quote = F,row.names = F,col.names = F)
# genes<-paste0(bam_path,"inex.genes.txt")
# 
# #ftype" #"intron or exon"
# 
# gtf_file<- "/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.annotation.gtf" #gtf file containing the transcript info"
# txdb <- makeTxDbFromGFF(gtf_file, format="gtf")
# 
# # restrict to canonical chromosomes
# 
# canonical_chr<-seqlevels(txdb)[1:25]
# 
# 
# tmp<-transcriptsBy(txdb, by="gene")
# gene2transcript<-lapply( tmp, function(x){  mcols(x)$tx_name })
# 
# saveDb(txdb, file="/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.sqlite")
# save(gene2transcript,file="/data/share/htp/prime-seq_Paper/genomes/Hsap/gene2transcript_canonical.Rdata" )
# 
# txdb<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.sqlite"
# g2t<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gene2transcript_canonical.Rdata"
# 
# maxTdist <- 10000 #"max dist. to 3' end counted"
# minTlength <- 0  #"minimal length of transcripts considered"
# max_exon_length<- 5000 # max length of exons per gene
# ngenes<- 2000 #max number of genes to use
# 
# # i="HEK.ACTGAGCGAAAACT.demx.bam"
# # j="intron"
# 
# # index
#  for (i in bam_files){
#    if (!(file.exists(paste0(bam_path,i,".bai"))))
# 
#    # index bam file
#   system(paste0("samtools index ",bam_path,i))
# 
#  }
# 
# 
# ## start slurm jobs
#  for (i in bam_files){
#    for (j in c("exon","intron")){
#   out<- str_split(i,pattern = "[.]",simplify = T)[,2]
# 
# 
#   system(
#     paste0(
#       "sbatch -J ",
#       out,
#       " --wrap '/opt/bin/Rscript",
#       " /data/share/common/scripts/get3distance_v3_SP_v2_LW_v1.R",
#       " --bamfile ",
#       paste0(bam_path, i),
#       " --txdb ",
#       txdb,
#       " --g2tmapping ",
#       g2t,
#       " --out ",
#       paste0(out,"_", j),
#       " --minTlength ",
#       minTlength,
#       " --max_exon_length ",
#       max_exon_length,
#       " --ftype ",
#       j,
#       " --plot F ",
#       " --canonical T ",
#       " --genes ",
#       genes,
#       " --ngenes ",
#       ngenes,
#       "'"
#     )
#   )
# 
# }}
#

intron only genes

counts_hek_in_o<-counts_hek_in_o[,inf$BC]

intron_only_genes <-unique(rownames(counts_hek_in_o))

intron_only_genes<-intron_only_genes[order(rowSums(counts_hek_in_o[intron_only_genes,]),decreasing = T)] # sort by highest overall count

genes <- intron_only_genes[1:2000] #if you want to subset genes, list of ENSG ids
write.table(genes,paste0(bam_path,"intron_only.genes.txt"),quote = F,row.names = F,col.names = F)
genes<-paste0(bam_path,"intron_only.genes.txt")

gtf_file<- "/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.annotation.gtf" #gtf file containing the transcript info"
txdb<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.sqlite"
g2t<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gene2transcript_canonical.Rdata"

maxTdist <- 10000 #"max dist. to 3' end counted"
minTlength <- 0  #"minimal length of transcripts considered"
max_exon_length<- 5000 # max length of exons per gene
ngenes<- 2000 #max number of genes to use



# ## start slurm jobs
#  for (i in bam_files){
#   out<- str_split(i,pattern = "[.]",simplify = T)[,2]
# 
# 
#   system(
#     paste0(
#       "sbatch -J ",
#       out,
#       " --wrap '/opt/bin/Rscript",
#       " /data/share/common/scripts/get3distance_v3_SP_v2_LW_v1.R",
#       " --bamfile ",
#       paste0(bam_path, i),
#       " --txdb ",
#       txdb,
#       " --g2tmapping ",
#       g2t,
#       " --out ",
#       paste0(out,"_", "intron_only"),
#       " --minTlength ",
#       minTlength,
#       " --max_exon_length ",
#       max_exon_length,
#       " --ftype ",
#       "intron",
#       " --plot F ",
#       " --canonical T ",
#       " --genes ",
#       genes,
#       " --ngenes ",
#       ngenes,
#       "'"
#     )
#   )
# 
# }

tables<-list.files(paste0(fig_path,"/genebody_coverage/"),pattern="on.txt")
tables<-c(tables,list.files(paste0(fig_path,"/genebody_coverage/"),pattern="intron_only.txt"))



for (i in tables){
  if(!(exists("combined"))){
    combined<-read_delim(paste0(fig_path,"/genebody_coverage/",i),
                         delim = "\t",
                         show_col_types = FALSE) %>% 
      mutate(BC=str_split(i,pattern=c("_"),simplify = T)[,1],
             type=str_split(str_split_fixed(i,pattern=c("_"),n = 2)[,2],pattern="[.]",simplify=T)[,1])
    
  }else{
    combined<-read_delim(paste0(fig_path,"/genebody_coverage/",i),delim = "\t",
                         show_col_types = FALSE) %>% 
       mutate(BC=str_split(i,pattern=c("_"),simplify = T)[,1],
             type=str_split(str_split_fixed(i,pattern=c("_"),n = 2)[,2],pattern="[.]",simplify=T)[,1]) %>% 
      bind_rows(combined)
  }
  
}


combined_sum<-combined %>% 
  group_by(distance_to_3_end,type) %>% 
  summarize(total_count=sum(read_number)) %>%
  group_by(type) %>% 
  filter(total_count>0) %>% 
  filter(distance_to_3_end<5000) %>% 
  mutate(scaled_count=(total_count/sum(total_count))*1e6) %>% 
  mutate(type=case_when(type=="exon"~"Exon",
                        type=="intron"~"Intron",
                        T~ "Intron only"))

ggplot(combined_sum, aes(x=distance_to_3_end, y=total_count,col=type)) + 
  geom_smooth()+
  #scale_y_log10() +
  scale_x_reverse() +
  theme_minimal()+
    facet_wrap(~type,scales="free")+
  scale_colour_manual(values=inex_colors,limits=force)

  
p_enrich<-ggplot(combined_sum, aes(x=distance_to_3_end, y=scaled_count,col=type)) + 
  geom_smooth()+
  #scale_y_log10() +
  scale_x_reverse(limits=c(5000,0)) +
  labs(x= "Distance to 3 prime end" ,
       y= "Counts per position per million",
       colour="")+
  scale_colour_manual(values=inex_colors,limits=force)+
  theme(legend.position = c(0.3,0.7),
        legend.background = element_blank())
  

p_enrich

ggsave(p_enrich,
       device = "pdf",
       path = fig_path,
       width = 180,
       height=100,
       units = "mm",
       filename = "enrich_plot.pdf"
       )

11. check bam coverage for genes with high intron and exon expression

top_genes_inex<-df_rank %>% 
  filter(!(is.na(external_gene_name))) %>% 
  filter(!(is.na(chromosome_name))) %>% 
  filter(exon_rank<19500 & intron_rank> 15500) %>% 
  arrange(desc(exon_rank)) %>% 
  unique() %>% 
  slice_max(exon_rank,n=20)

colnames(exon)<-paste0(colnames(exon),"_Exon")
colnames(intron)<-paste0(colnames(intron),"_Intron")


full_mat<-list(rownames_to_column(as.data.frame(exon),var="Gene_ID"),rownames_to_column(as.data.frame(intron),var="Gene_ID")) %>% 
  plyr::join_all() %>% 
  column_to_rownames(var="Gene_ID") %>% 
  as.data.frame() %>% 
  as.matrix() 

full_mat[is.na(full_mat)]<-0

inex_mat<-full_mat[,17:32]+full_mat[,1:16]
colnames(inex_mat)<-paste0(str_split(colnames(inex_mat),pattern = "_",simplify = T)[,1],"_Both")

full_mat <-list(rownames_to_column(as.data.frame(inex_mat),var="Gene_ID"),rownames_to_column(as.data.frame(full_mat),var="Gene_ID")) %>% 
  plyr::join_all() %>% 
  column_to_rownames(var="Gene_ID") %>% 
  as.data.frame() %>% 
  as.matrix() 


top_genes_mat<-full_mat[top_genes_inex$ENSEMBL,] %>% 
  t() %>% 
  as.data.frame() %>% 
  rownames_to_column(var="Sample") %>% 
  mutate(type=str_split(Sample,pattern = "_",simplify = T)[,2])

12. plot coverage for most highly expressed exons and introns

source("/data/home/wange/ATAC/Coverage_Plot_functions_V1.R")

gtf_file<-"/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.annotation.gtf"

gtf<-rtracklayer::import.gff2(gtf_file)
# 
# GV_gene_mod<-makeGviz_geneModel_from_gencode(gtf)
# 
# saveRDS(GV_gene_mod,"/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.annotation.gtf.genemodels_Geneviz.rds")

GV_gene_mod<-readRDS("/data/share/htp/prime-seq_Paper/genomes/Hsap/gencode.v35.primary_assembly.annotation.gtf.genemodels_Geneviz.rds")


annot.colors <- list("introns"="#ff5154","exons"="#91a6ff")


bam_files<-"/data/share/htp/prime-seq_Paper/Fig_beads_columns/zUMIs/HEK/Bulk_opt_lysis_test_2_HEK.filtered.Aligned.GeneTagged.sorted.bam"

inex<-rev(c("introns","exons"))
b.inf <-data.frame(bamfile=c(paste(gsub(x = bam_files,pattern=".bam",replacement = ""),inex,"bam",sep=".")),cond1=inex,cond2=rep("HEK",times=2))
gene.models<- GV_gene_mod # genemodels from makeGviz_geneModel_from_gencode
gen="hg38" # genome
type="RNA" # sequencing type RNA-seq or ATAC-seq


# for (i in 1:20){
# # plotting range 
#   PRange <- gtf %>%
#     as.tibble %>% 
#     mutate(ENSG=str_split_fixed(gene_id,
#                     pattern = "[.]",
#                     n = 2)[, 1]) %>% 
#     filter(ENSG %in% top_genes_inex$ENSEMBL[i]) %>% 
#     group_by(seqnames, strand, gene_id) %>%
#     summarize(start = min(start), end = max(end)) %>%
#     plyranges::as_granges()
# 
# print(top_genes_inex$external_gene_name[i])
# print(top_genes_inex$chromosome_name[i])
# print(length(PRange)>0)
# if(length(PRange)>0){
# 
# figure.name1 = paste0("coveragePlots/Intron_Exon_mapping_",top_genes_inex$ensembl_gene_id_version[i],".pdf")
# 
# 
# 
# Plot_Coverage(PRange=PRange,
#               annot.colors=annot.colors,
#               b.inf=b.inf,
#               gene.models=gene.models,
#               figure.name=figure.name1,
#               gen=gen,
#               type=type,
#               pdf=T,
#               title=top_genes_inex$external_gene_name[i],
#               extend.range=2e4,
#               ymax=(sum(top_genes_mat[top_genes_mat$type=="Exon",top_genes_inex$ENSEMBL[i]])/4)## set y axis for coverage plot to sum of exon counts
#               )
# 
# 
# }
# }
# 
# 
# 
# for (i in 1:20){
# 
# figure.name2 = paste0("coveragePlots/Intron_Exon_expression_",top_genes_inex$ensembl_gene_id_version[i],".pdf")
# p.exp<-top_genes_mat %>% 
#   dplyr::select(c(type,top_genes_inex$ENSEMBL[i])) %>% 
#   rename("Ensembl"=top_genes_inex$ENSEMBL[i]) %>% 
#   filter(type!="Both")%>% 
#   ggplot(aes(y=Ensembl,x=type,col=type))+
#   geom_boxplot(show.legend = F)+
#   ggbeeswarm::geom_beeswarm(aes(col=type),show.legend = F)+
#   facet_wrap(~type,scale="free_x",nrow = 2)+
#   theme(axis.text.x = element_blank(),
#         axis.ticks.x= element_blank())+
#   labs(x="",y=paste0("UMIs (",top_genes_inex$external_gene_name[i],")"))+
#   scale_color_manual(values=inex_colors,limits=force) 
# 
# ggsave(plot = p.exp,device = "pdf",filename = figure.name2,width=2,height=4,scale=1.5)
# }

12. Export plot for ENAH

i=8
 PRange <- gtf %>%
    as.tibble %>% 
    mutate(ENSG=str_split_fixed(gene_id,
                    pattern = "[.]",
                    n = 2)[, 1]) %>% 
    filter(ENSG %in% top_genes_inex$ENSEMBL[i]) %>% 
    group_by(seqnames, strand, gene_id) %>%
    summarize(start = min(start), end = max(end)) %>%
    plyranges::as_granges()

 
figure.name1 = paste0("coveragePlots/Intron_Exon_mapping_",top_genes_inex$ensembl_gene_id_version[i],".pdf")
 
Plot_Coverage(PRange=PRange,
              annot.colors=annot.colors,
              b.inf=b.inf,
              gene.models=gene.models,
              figure.name=figure.name1,
              gen=gen,
              type=type,
              pdf=F,
              title=top_genes_inex$external_gene_name[i],
              extend.range=2e4,
              ymax=400
              )

figure.name2 = paste0("coveragePlots/Intron_Exon_expression_",top_genes_inex$ensembl_gene_id_version[i],".pdf")

p.exp<-top_genes_mat %>%
  dplyr::select(c(type,top_genes_inex$ENSEMBL[i])) %>%
  dplyr::rename("Ensembl"=top_genes_inex$ENSEMBL[i]) %>%
  filter(type!="Both")%>%
  ggplot(aes(y=Ensembl,x=type,col=type))+
  geom_boxplot(show.legend = F)+
  ggbeeswarm::geom_beeswarm(aes(col=type),show.legend = F)+
  facet_wrap(~type,scale="free_x",nrow = 2)+
  theme(axis.text.x = element_blank(),
        axis.ticks.x= element_blank())+
  labs(x="",y=paste0("UMIs (",top_genes_inex$external_gene_name[i],")"))+
  scale_color_manual(values=inex_colors,limits=force)
p.exp

`R` Session Info

sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Devuan GNU/Linux 3 (beowulf)
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.3.5.so
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#>  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
#>  [8] datasets  methods   base     
#> 
#> other attached packages:
#>  [1] biomaRt_2.48.3       Gviz_1.36.2          GenomicRanges_1.44.0
#>  [4] GenomeInfoDb_1.28.4  IRanges_2.26.0       S4Vectors_0.30.2    
#>  [7] BiocGenerics_0.38.0  ComplexUpset_1.3.1   ggbeeswarm_0.6.0    
#> [10] ggsci_2.9            cowplot_1.1.1        forcats_0.5.1       
#> [13] stringr_1.4.0        dplyr_1.0.7          purrr_0.3.4         
#> [16] readr_2.1.0          tidyr_1.1.4          tibble_3.1.6        
#> [19] ggplot2_3.3.5        tidyverse_1.3.1     
#> 
#> loaded via a namespace (and not attached):
#>   [1] readxl_1.3.1                backports_1.4.0            
#>   [3] Hmisc_4.6-0                 systemfonts_1.0.3          
#>   [5] BiocFileCache_2.0.0         plyr_1.8.6                 
#>   [7] lazyeval_0.2.2              splines_4.1.0              
#>   [9] BiocParallel_1.26.2         digest_0.6.28              
#>  [11] ensembldb_2.16.4            htmltools_0.5.2            
#>  [13] fansi_0.5.0                 magrittr_2.0.1             
#>  [15] checkmate_2.0.0             memoise_2.0.1              
#>  [17] BSgenome_1.60.0             cluster_2.1.2              
#>  [19] tzdb_0.2.0                  Biostrings_2.60.2          
#>  [21] modelr_0.1.8                matrixStats_0.61.0         
#>  [23] vroom_1.5.6                 prettyunits_1.1.1          
#>  [25] jpeg_0.1-9                  colorspace_2.0-2           
#>  [27] blob_1.2.2                  rvest_1.0.2                
#>  [29] rappdirs_0.3.3              textshaping_0.3.6          
#>  [31] haven_2.4.3                 xfun_0.28                  
#>  [33] crayon_1.4.2                RCurl_1.98-1.5             
#>  [35] jsonlite_1.7.2              VariantAnnotation_1.38.0   
#>  [37] survival_3.2-13             glue_1.5.0                 
#>  [39] gtable_0.3.0                zlibbioc_1.38.0            
#>  [41] XVector_0.32.0              DelayedArray_0.18.0        
#>  [43] plyranges_1.12.1            scales_1.1.1               
#>  [45] DBI_1.1.1                   Rcpp_1.0.7                 
#>  [47] isoband_0.2.5               viridisLite_0.4.0          
#>  [49] progress_1.2.2              htmlTable_2.3.0            
#>  [51] foreign_0.8-81              bit_4.0.4                  
#>  [53] Formula_1.2-4               htmlwidgets_1.5.4          
#>  [55] httr_1.4.2                  RColorBrewer_1.1-2         
#>  [57] ellipsis_0.3.2              farver_2.1.0               
#>  [59] pkgconfig_2.0.3             XML_3.99-0.8               
#>  [61] nnet_7.3-16                 dbplyr_2.1.1               
#>  [63] here_1.0.1                  utf8_1.2.2                 
#>  [65] labeling_0.4.2              tidyselect_1.1.1           
#>  [67] rlang_0.4.12                AnnotationDbi_1.54.1       
#>  [69] munsell_0.5.0               cellranger_1.1.0           
#>  [71] tools_4.1.0                 cachem_1.0.6               
#>  [73] cli_3.1.0                   generics_0.1.1             
#>  [75] RSQLite_2.2.8               broom_0.7.10               
#>  [77] evaluate_0.14               fastmap_1.1.0              
#>  [79] ragg_1.2.0                  yaml_2.2.1                 
#>  [81] knitr_1.36                  bit64_4.0.5                
#>  [83] fs_1.5.0                    KEGGREST_1.32.0            
#>  [85] AnnotationFilter_1.16.0     nlme_3.1-153               
#>  [87] xml2_1.3.2                  compiler_4.1.0             
#>  [89] rstudioapi_0.13             beeswarm_0.4.0             
#>  [91] filelock_1.0.2              curl_4.3.2                 
#>  [93] png_0.1-7                   reprex_2.0.1               
#>  [95] stringi_1.7.4               highr_0.9                  
#>  [97] GenomicFeatures_1.44.2      lattice_0.20-45            
#>  [99] ProtGenerics_1.24.0         Matrix_1.3-4               
#> [101] vctrs_0.3.8                 pillar_1.6.4               
#> [103] lifecycle_1.0.1             data.table_1.14.2          
#> [105] bitops_1.0-7                patchwork_1.1.1            
#> [107] rtracklayer_1.52.1          R6_2.5.1                   
#> [109] BiocIO_1.2.0                latticeExtra_0.6-29        
#> [111] gridExtra_2.3               vipor_0.4.5                
#> [113] dichromat_2.0-0             MASS_7.3-54                
#> [115] assertthat_0.2.1            SummarizedExperiment_1.22.0
#> [117] rprojroot_2.0.2             rjson_0.2.20               
#> [119] withr_2.4.2                 GenomicAlignments_1.28.0   
#> [121] Rsamtools_2.8.0             GenomeInfoDbData_1.2.6     
#> [123] mgcv_1.8-38                 hms_1.1.1                  
#> [125] rpart_4.1-15                rmarkdown_2.11             
#> [127] MatrixGenerics_1.4.3        biovizBase_1.40.0          
#> [129] Biobase_2.52.0              lubridate_1.8.0            
#> [131] base64enc_0.1-3             restfulr_0.0.13

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

HEK_intron_exon.md

HEK_intron_exon.md

Purpose:

1. Load packages:

2. Load functions:

3. Load annotation:

4. Load data:

5. Plot gene biotypes per group

6. upset plot of intersections

7. Gene and UMI counts for the different conditions

8. Exon vs. Intron expression correlation

9. ranked exon vs. intron

10. Genbody coverage

get per bam file geneBody coverage

intron only genes

11. check bam coverage for genes with high intron and exon expression

12. plot coverage for most highly expressed exons and introns

12. Export plot for ENAH

`R` Session Info

Files

HEK_intron_exon.md

Latest commit

History

HEK_intron_exon.md

File metadata and controls

Purpose:

1. Load packages:

2. Load functions:

3. Load annotation:

4. Load data:

5. Plot gene biotypes per group

6. upset plot of intersections

7. Gene and UMI counts for the different conditions

8. Exon vs. Intron expression correlation

9. ranked exon vs. intron

10. Genbody coverage

get per bam file geneBody coverage

intron only genes

11. check bam coverage for genes with high intron and exon expression

12. plot coverage for most highly expressed exons and introns

12. Export plot for ENAH

R Session Info

`R` Session Info