From fb2f514e2712ded692dd09716c556c76e8b41270 Mon Sep 17 00:00:00 2001 From: Alexandria Pinto Date: Wed, 13 Nov 2024 13:15:44 -0500 Subject: [PATCH] Modify generate gene bed to one script --- bin/final_generate_v111_gene_bed.R | 120 ------------------ ...ate_v75_gene_bed.R => generate_gene_bed.R} | 13 +- modules/local/metafusion/genebed/main.nf | 63 +++------ .../usr/bin/final_generate_v111_gene_bed.R | 120 ------------------ ...ate_v75_gene_bed.R => generate_gene_bed.R} | 14 +- 5 files changed, 27 insertions(+), 303 deletions(-) delete mode 100755 bin/final_generate_v111_gene_bed.R rename bin/{final_generate_v75_gene_bed.R => generate_gene_bed.R} (88%) delete mode 100755 modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R rename modules/local/metafusion/genebed/resources/usr/bin/{final_generate_v75_gene_bed.R => generate_gene_bed.R} (88%) diff --git a/bin/final_generate_v111_gene_bed.R b/bin/final_generate_v111_gene_bed.R deleted file mode 100755 index d114ccb..0000000 --- a/bin/final_generate_v111_gene_bed.R +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/local/bin/Rscript - -# __author__ = "Alexandria Dymun" -# __email__ = "pintoa1@mskcc.org" -# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" -# __version__ = "0.0.1" -# __status__ = "Dev" - - -suppressPackageStartupMessages({ - library(plyr) - library(dplyr) - library(data.table) - library(stringr) - options(scipen = 999) -}) - -usage <- function() { - message("Usage:") - message("final_generate_v111_gene_bed.R ") -} - -args = commandArgs(TRUE) - -if (length(args)!=2) { - usage() - quit() -} - -gtf <- rtracklayer::import(args[1]) -gtf_df <- as.data.frame(gtf) -#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) -gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),] - -file.to_write <- args[2] - -### convert start to 0-based to match metafusion expectations of gff format -gtf_df <- gtf_df %>% - rename( - chr = seqnames - ) %>% - select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","five_prime_utr","three_prime_utr","CDS")) %>% - mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1) - - -#START CLOCK -ptm <- proc.time() -print(ptm) - -# Index each transcript feature, incrementing when an intron is passed -## metafusion expects exon count 0 to (N(exons)-1) -## Forward strand: Exon 0 == Exon 1 -### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT - -print(dim(gtf_df)) -print(length(unique(gtf_df$transcript_id))) - -modify_transcript <- function(transcript){ - - # Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another - if ("CDS" %in% transcript$type){ - transcript <- transcript[!transcript$type == "exon",] - } - # Order features by increasing bp - transcript <- transcript[order(transcript$start, decreasing = FALSE),] - # Index features - idx <- 0 - for (i in 1:nrow(transcript)){ - transcript$idx[i]<- idx - if (transcript$type[i] == "intron"){ - idx <- idx + 1 - } - } - # REFORMAT TRANSCRIPT - #Change strand info (+ --> f, - --> r) - if (unique(transcript$strand) == "+"){ - transcript$strand <- 'f' - } else if (unique(transcript$strand) == "-"){ - transcript$strand <- 'r' - } else { - errorCondition("Strand info for this transcript is inconsistent") - } - #Add "chr" prefix to chromosomes - transcript$chr <- sapply("chr", paste0, transcript$chr) - #Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later - transcript <- transcript %>% mutate(type = as.character(type)) - transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type)) - transcript$type[transcript$type == "five_prime_utr"] <- "utr5" - transcript$type[transcript$type == "three_prime_utr"] <- "utr3" - #### Any exon that remains after the cds change, is likely and untranslated region. change below - # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5) - #Forward strand - transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5" - #Reverse strand - transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3" - expected_types <- c("cds","intron","utr3","utr5") - transcript <- transcript[transcript$type %in% c(expected_types),] - return(transcript) -} - -if(file.exists(file.to_write) ) {file.remove(file.to_write)} - -gtf_df_modified <- gtf_df %>% - group_by(transcript_id,.drop = FALSE) %>% - group_modify(~ modify_transcript(.x)) %>% - select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>% - arrange(chr,start,end) - -time <- proc.time() - ptm -print(time) - -write.table( - gtf_df_modified, - file.to_write, - sep="\t", - quote=F, - row.names=F, - col.names=F -) diff --git a/bin/final_generate_v75_gene_bed.R b/bin/generate_gene_bed.R similarity index 88% rename from bin/final_generate_v75_gene_bed.R rename to bin/generate_gene_bed.R index a25b3ef..2a15149 100755 --- a/bin/final_generate_v75_gene_bed.R +++ b/bin/generate_gene_bed.R @@ -17,7 +17,7 @@ suppressPackageStartupMessages({ usage <- function() { message("Usage:") - message("final_generate_v75_gene_bed.R ") + message("generate_gene_bed.R ") } args = commandArgs(TRUE) @@ -27,13 +27,6 @@ if (length(args)!=2) { quit() } -# Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75 -# Add introns to gtf, convert to gff3 -# bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\ -# -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 \\ -# /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\ -# -o genes.INTRONS.gff3" - gtf <- rtracklayer::import(args[1]) gtf_df <- as.data.frame(gtf) #remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) @@ -47,7 +40,7 @@ gtf_df <- gtf_df %>% chr = seqnames ) %>% select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% + filter(type %in% c("exon","intron","UTR","CDS","cds","utr","five_prime_utr","three_prime_utr")) %>% mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1) @@ -110,6 +103,8 @@ modify_transcript <- function(transcript){ transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5" } } + transcript$type[transcript$type == "five_prime_utr"] <- "utr5" + transcript$type[transcript$type == "three_prime_utr"] <- "utr3" #### Any exon that remains after teh cds change, is likely and untranslated region. change below # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5) diff --git a/modules/local/metafusion/genebed/main.nf b/modules/local/metafusion/genebed/main.nf index 9cd2769..1fb97b5 100644 --- a/modules/local/metafusion/genebed/main.nf +++ b/modules/local/metafusion/genebed/main.nf @@ -20,55 +20,30 @@ process METAFUSION_GENEBED { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - if( prefix == 'GRCh37' ) - """ - final_generate_v75_gene_bed.R \\ - $gff \\ - ${prefix}.metafusion.gene.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - R: \$(R --version | head -n1) - final_generate_v75_gene_bed.R: 0.0.2 - END_VERSIONS - """ - - else if( prefix == 'GRCh38' ) - """ - final_generate_v111_gene_bed.R \\ - $gff \\ - ${prefix}.metafusion.gene.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - R: \$(R --version | head -n1) - final_generate_v111_gene_bed.R: 0.0.1 - END_VERSIONS - """ + """ + generate_gene_bed.R \\ + $gff \\ + ${prefix}.metafusion.gene.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + generate_gene_bed.R: 0.0.2 + END_VERSIONS + """ stub: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - if( prefix == 'GRCh37' ) - """ - touch ${prefix}.metafusion.gene.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - R: \$(R --version | head -n1) - final_generate_v75_gene_bed.R: 0.0.2 - END_VERSIONS - """ - else if( prefix == 'GRCh38' ) - """ - touch ${prefix}.metafusion.gene.bed + """ + touch ${prefix}.metafusion.gene.bed - cat <<-END_VERSIONS > versions.yml - "${task.process}": - R: \$(R --version | head -n1) - final_generate_v111_gene_bed.R: 0.0.1 - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + generate_gene_bed.R: 0.0.2 + END_VERSIONS + """ } diff --git a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R deleted file mode 100755 index d114ccb..0000000 --- a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/local/bin/Rscript - -# __author__ = "Alexandria Dymun" -# __email__ = "pintoa1@mskcc.org" -# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" -# __version__ = "0.0.1" -# __status__ = "Dev" - - -suppressPackageStartupMessages({ - library(plyr) - library(dplyr) - library(data.table) - library(stringr) - options(scipen = 999) -}) - -usage <- function() { - message("Usage:") - message("final_generate_v111_gene_bed.R ") -} - -args = commandArgs(TRUE) - -if (length(args)!=2) { - usage() - quit() -} - -gtf <- rtracklayer::import(args[1]) -gtf_df <- as.data.frame(gtf) -#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) -gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),] - -file.to_write <- args[2] - -### convert start to 0-based to match metafusion expectations of gff format -gtf_df <- gtf_df %>% - rename( - chr = seqnames - ) %>% - select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","five_prime_utr","three_prime_utr","CDS")) %>% - mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1) - - -#START CLOCK -ptm <- proc.time() -print(ptm) - -# Index each transcript feature, incrementing when an intron is passed -## metafusion expects exon count 0 to (N(exons)-1) -## Forward strand: Exon 0 == Exon 1 -### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT - -print(dim(gtf_df)) -print(length(unique(gtf_df$transcript_id))) - -modify_transcript <- function(transcript){ - - # Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another - if ("CDS" %in% transcript$type){ - transcript <- transcript[!transcript$type == "exon",] - } - # Order features by increasing bp - transcript <- transcript[order(transcript$start, decreasing = FALSE),] - # Index features - idx <- 0 - for (i in 1:nrow(transcript)){ - transcript$idx[i]<- idx - if (transcript$type[i] == "intron"){ - idx <- idx + 1 - } - } - # REFORMAT TRANSCRIPT - #Change strand info (+ --> f, - --> r) - if (unique(transcript$strand) == "+"){ - transcript$strand <- 'f' - } else if (unique(transcript$strand) == "-"){ - transcript$strand <- 'r' - } else { - errorCondition("Strand info for this transcript is inconsistent") - } - #Add "chr" prefix to chromosomes - transcript$chr <- sapply("chr", paste0, transcript$chr) - #Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later - transcript <- transcript %>% mutate(type = as.character(type)) - transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type)) - transcript$type[transcript$type == "five_prime_utr"] <- "utr5" - transcript$type[transcript$type == "three_prime_utr"] <- "utr3" - #### Any exon that remains after the cds change, is likely and untranslated region. change below - # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5) - #Forward strand - transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5" - #Reverse strand - transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3" - expected_types <- c("cds","intron","utr3","utr5") - transcript <- transcript[transcript$type %in% c(expected_types),] - return(transcript) -} - -if(file.exists(file.to_write) ) {file.remove(file.to_write)} - -gtf_df_modified <- gtf_df %>% - group_by(transcript_id,.drop = FALSE) %>% - group_modify(~ modify_transcript(.x)) %>% - select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>% - arrange(chr,start,end) - -time <- proc.time() - ptm -print(time) - -write.table( - gtf_df_modified, - file.to_write, - sep="\t", - quote=F, - row.names=F, - col.names=F -) diff --git a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R b/modules/local/metafusion/genebed/resources/usr/bin/generate_gene_bed.R similarity index 88% rename from modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R rename to modules/local/metafusion/genebed/resources/usr/bin/generate_gene_bed.R index 1fb3d76..2a15149 100755 --- a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R +++ b/modules/local/metafusion/genebed/resources/usr/bin/generate_gene_bed.R @@ -17,7 +17,7 @@ suppressPackageStartupMessages({ usage <- function() { message("Usage:") - message("final_generate_v75_gene_bed.R ") + message("generate_gene_bed.R ") } args = commandArgs(TRUE) @@ -27,13 +27,6 @@ if (length(args)!=2) { quit() } -# Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75 -# Add introns to gtf, convert to gff3 -# bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\ -# -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 \\ -# /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\ -# -o genes.INTRONS.gff3" - gtf <- rtracklayer::import(args[1]) gtf_df <- as.data.frame(gtf) #remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) @@ -47,11 +40,10 @@ gtf_df <- gtf_df %>% chr = seqnames ) %>% select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% + filter(type %in% c("exon","intron","UTR","CDS","cds","utr","five_prime_utr","three_prime_utr")) %>% mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1) - #START CLOCK ptm <- proc.time() print(ptm) @@ -111,6 +103,8 @@ modify_transcript <- function(transcript){ transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5" } } + transcript$type[transcript$type == "five_prime_utr"] <- "utr5" + transcript$type[transcript$type == "three_prime_utr"] <- "utr3" #### Any exon that remains after teh cds change, is likely and untranslated region. change below # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)