From 26354c51253f89ed1473f2d9bbf420872cf57eda Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 27 Feb 2025 14:02:53 -0500 Subject: [PATCH] Improvements in R script --- bedboss/bedstat/bedstat.py | 4 +- bedboss/bedstat/tools/regionstat.R | 228 +++++++++++-------------- bedboss/bedstat/tools/regionstat_cli.R | 40 +++++ scripts/bbuploader/main.py | 2 +- 4 files changed, 140 insertions(+), 134 deletions(-) create mode 100644 bedboss/bedstat/tools/regionstat_cli.R diff --git a/bedboss/bedstat/bedstat.py b/bedboss/bedstat/bedstat.py index 75b0c39..440dd1b 100755 --- a/bedboss/bedstat/bedstat.py +++ b/bedboss/bedstat/bedstat.py @@ -151,14 +151,14 @@ def bedstat( os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "bedstat", "tools", - "regionstat.R", + "regionstat_cli.R", ) assert os.path.exists(rscript_path), FileNotFoundError( f"'{rscript_path}' script not found" ) command = ( f"Rscript {rscript_path} --bedfilePath={bedfile} " - f"--fileId={bed_digest} --openSignalMatrix={open_signal_matrix} " + f"--openSignalMatrix={open_signal_matrix} " f"--outputFolder={outfolder_stats_results} --genome={genome} " f"--ensdb={ensdb} --digest={bed_digest}" ) diff --git a/bedboss/bedstat/tools/regionstat.R b/bedboss/bedstat/tools/regionstat.R index 525bd45..fa183bb 100644 --- a/bedboss/bedstat/tools/regionstat.R +++ b/bedboss/bedstat/tools/regionstat.R @@ -4,49 +4,13 @@ library(GenomicDistributions) # library(GenomicDistributionsData) # library(GenomeInfoDb) # library(ensembldb) -library(optparse) + library(tools) library(R.utils) library(rjson) trim <- IRanges::trim -option_list = list( - make_option(c("--bedfilePath"), type="character", default=NULL, - help="full path to a BED file to process", metavar="character"), - make_option(c("--fileId"), type="character", default=NULL, - help="BED file ID to use for output files prefix", metavar="character"), - make_option(c("--openSignalMatrix"), type="character", - help="path to the open signal matrix required for the tissue specificity plot", metavar="character"), - make_option(c("--digest"), type="character", default=NULL, - help="digest of the BED file", metavar="character"), - make_option(c("--outputFolder"), type="character", default="output", - help="base output folder for results", metavar="character"), - make_option(c("--genome"), type="character", default="hg38", - help="genome reference to calculate against", metavar="character"), - make_option(c("--ensdb"), type="character", - help="path to the Ensembl annotation gtf file", metavar="character") -) - - -opt_parser = OptionParser(option_list=option_list); -opt = parse_args(opt_parser); - -if (is.null(opt$bedfilePath)) { - print_help(opt_parser) - stop("Bed file input missing.") -} - -if (is.null(opt$fileId)) { - print_help(opt_parser) - stop("fileId input missing.") -} - -if (is.null(opt$digest)) { - print_help(opt_parser) - stop("digest input missing.") -} - myPartitionList <- function(gtffile){ features = c("gene", "exon", "three_prime_utr", "five_prime_utr") geneModels = getGeneModelsFromGTF(gtffile, features, TRUE) @@ -54,8 +18,8 @@ myPartitionList <- function(gtffile){ geneModels$exon, geneModels$three_prime_utr, geneModels$five_prime_utr) - - return (partitionList) + + return (partitionList) } @@ -71,37 +35,37 @@ myChromSizes <- function(genome){ return(chromSizesGenome) } -plotBoth <- function(plotId, g){ - pth = paste0(opt$outputFolder, "/", fileId, "_", plotId) +plotBoth <- function(plotId, g, digest, outfolder){ + pth = paste0(outfolder, "/", digest, "_", plotId) print(paste0("Plotting: ", pth)) ggplot2::ggsave(paste0(pth, ".png"), g, device="png", width=8, height=8, units="in") ggplot2::ggsave(paste0(pth, ".pdf"), g, device="pdf", width=8, height=8, units="in") } -getPlotReportDF <- function(plotId, title){ - pth = paste0(opt$outputFolder, "/", fileId, "_", plotId) - rel_pth = getRelativePath(pth, paste0(opt$outputFolder, "/../../../")) +getPlotReportDF <- function(plotId, title, digest, outfolder){ + pth = paste0(outfolder, "/", digest, "_", plotId) + rel_pth = getRelativePath(pth, paste0(outfolder, "/../../../")) print(paste0("Writing plot json: ", rel_pth)) newPlot = data.frame( - "name"=plotId, - "title"=title, - "thumbnail_path"=paste0(rel_pth, ".png"), + "name"=plotId, + "title"=title, + "thumbnail_path"=paste0(rel_pth, ".png"), "path"=paste0(rel_pth, ".pdf") ) return(newPlot) } -doItAall <- function(query, fileId, genome, cellMatrix) { +doItAll <- function(query, digest, genome, openSignalMatrix, outfolder, BSg, BSgm, gtffile) { plots = data.frame(stringsAsFactors=F) bsGenomeAvail = ifelse((requireNamespace(BSg, quietly=TRUE) | requireNamespace(BSgm, quietly=TRUE)), TRUE, FALSE) - + # check if json file exist for the input bed file - meta_path = paste0(outfolder, "/", fileId, ".json") - plot_path = paste0(outfolder, "/", fileId, "_plots.json") + meta_path = paste0(outfolder, "/", digest, ".json") + plot_path = paste0(outfolder, "/", digest, "_plots.json") if (file.exists(meta_path)){ bedmeta = fromJSON(file=meta_path) - plots = fromJSON(file=plot_path) + plots = fromJSON(file=plot_path) plots = as.data.frame(do.call(rbind, plots)) } else{ plots = data.frame(stringsAsFactors=F) @@ -127,13 +91,13 @@ doItAall <- function(query, fileId, genome, cellMatrix) { } else{ if (genome %in% c("hg19", "hg38", "mm10", "mm9")) { TSSdist = calcFeatureDistRefTSS(query_new, genome) - plotBoth("tss_distance", plotFeatureDist( TSSdist, featureName="TSS")) + plotBoth("tss_distance", plotFeatureDist( TSSdist, featureName="TSS"), digest, outfolder) } else { tss = getTssFromGTF(gtffile, TRUE) TSSdist = calcFeatureDist(query_new, tss) - plotBoth("tss_distance", plotFeatureDist( TSSdist, featureName="TSS")) + plotBoth("tss_distance", plotFeatureDist( TSSdist, featureName="TSS"), digest, outfolder) } - plots = rbind(plots, getPlotReportDF("tss_distance", "Region-TSS distance distribution")) + plots = rbind(plots, getPlotReportDF("tss_distance", "Region-TSS distance distribution", digest, outfolder)) message("Successfully calculated and plot TSS distance.") } if (exists("bedmeta")){ @@ -156,19 +120,19 @@ doItAall <- function(query, fileId, genome, cellMatrix) { if (genome %in% c("mm39", "dm3", "dm6", "ce10", "ce11", "danRer10", "danRer10", "T2T")){ chromSizes = myChromSizes(genome) genomeBins = getGenomeBins(chromSizes) - plotBoth("chrombins", plotChromBins(calcChromBins(query, genomeBins))) + plotBoth("chrombins", plotChromBins(calcChromBins(query, genomeBins)), digest, outfolder) } else{ - plotBoth("chrombins", plotChromBins(calcChromBinsRef(query_new, genome))) + plotBoth("chrombins", plotChromBins(calcChromBinsRef(query_new, genome)), digest, outfolder) } - - plots = rbind(plots, getPlotReportDF("chrombins", "Regions distribution over chromosomes")) + + plots = rbind(plots, getPlotReportDF("chrombins", "Regions distribution over chromosomes", digest, outfolder)) message("Successfully calculated and plot chromosomes region distribution.") }, error = function(e){ message('Caught an error in creating: Chromosomes region distribution plot!') print(e) } - ) + ) } # We can calculate it differently @@ -211,8 +175,8 @@ doItAall <- function(query, fileId, genome, cellMatrix) { # ) # } # } - - + + # Partition plots, default to percentages if (exists("bedmeta")){ if ("exon_frequency" %in% names(bedmeta)){ @@ -222,7 +186,7 @@ doItAall <- function(query, fileId, genome, cellMatrix) { } } else { run_plot = TRUE - } + } if (run_plot){ tryCatch( @@ -232,21 +196,21 @@ doItAall <- function(query, fileId, genome, cellMatrix) { } else { if (genome %in% c("hg19", "hg38", "mm10")) { gp = calcPartitionsRef(query, genome) - plotBoth("partitions", plotPartitions(gp)) + plotBoth("partitions", plotPartitions(gp), digest, outfolder) } else { partitionList = myPartitionList(gtffile) gp = calcPartitions(query, partitionList) - plotBoth("partitions", plotPartitions(gp)) + plotBoth("partitions", plotPartitions(gp), digest, outfolder) } - plots = rbind(plots, getPlotReportDF("partitions", "Regions distribution over genomic partitions")) + plots = rbind(plots, getPlotReportDF("partitions", "Regions distribution over genomic partitions", digest, outfolder)) # flatten the result returned by the function above partiotionNames = as.vector(gp[,"partition"]) partitionsList = list() for(i in seq_along(partiotionNames)){ - partitionsList[[paste0(partiotionNames[i], "_frequency")]] = + partitionsList[[paste0(partiotionNames[i], "_frequency")]] = as.vector(gp[,"Freq"])[i] - partitionsList[[paste0(partiotionNames[i], "_percentage")]] = - as.vector(gp[,"Freq"])[i]/length(query) + partitionsList[[paste0(partiotionNames[i], "_percentage")]] = + as.vector(gp[,"Freq"])[i]/length(query) } if (exists("bedmeta")){ bedmeta = append(bedmeta, partitionsList) @@ -258,10 +222,10 @@ doItAall <- function(query, fileId, genome, cellMatrix) { message('Caught an error in creating: Partition plot!') print(e) } - ) + ) } - - + + # Expected partition plots if (!exists("bedmeta") ){ tryCatch( @@ -270,14 +234,14 @@ doItAall <- function(query, fileId, genome, cellMatrix) { message("Ensembl annotation gtf file not provided. Skipping expected partition plot ... ") } else{ if (genome %in% c("hg19", "hg38", "mm10")) { - plotBoth("expected_partitions", plotExpectedPartitions(calcExpectedPartitionsRef(query, genome))) + plotBoth("expected_partitions", plotExpectedPartitions(calcExpectedPartitionsRef(query, genome)), digest, outfolder) } else { partitionList = myPartitionList(gtffile) chromSizes = myChromSizes(genome) genomeSize = sum(chromSizes) - plotBoth("expected_partitions", plotExpectedPartitions(calcExpectedPartitions(query, partitionList, genomeSize))) + plotBoth("expected_partitions", plotExpectedPartitions(calcExpectedPartitions(query, partitionList, genomeSize)), digest, outfolder) } - plots = rbind(plots, getPlotReportDF("expected_partitions", "Expected distribution over genomic partitions")) + plots = rbind(plots, getPlotReportDF("expected_partitions", "Expected distribution over genomic partitions", digest, outfolder)) message("Successfully calculated and plot expected distribution over genomic partitions.") } }, @@ -285,9 +249,9 @@ doItAall <- function(query, fileId, genome, cellMatrix) { message('Caught an error in creating: Expected partition plot!') print(e) } - ) + ) } - + # Cumulative partition plots if (!exists("bedmeta") ){ tryCatch( @@ -296,12 +260,12 @@ doItAall <- function(query, fileId, genome, cellMatrix) { message("Ensembl annotation gtf file not provided. Skipping cumulative partition plot ... ") } else{ if (genome %in% c("hg19", "hg38", "mm10")) { - plotBoth("cumulative_partitions", plotCumulativePartitions(calcCumulativePartitionsRef(query, genome))) + plotBoth("cumulative_partitions", plotCumulativePartitions(calcCumulativePartitionsRef(query, genome)), digest, outfolder) } else{ partitionList = myPartitionList(gtffile) - plotBoth("cumulative_partitions", plotCumulativePartitions(calcCumulativePartitions(query, partitionList))) + plotBoth("cumulative_partitions", plotCumulativePartitions(calcCumulativePartitions(query, partitionList)), digest, outfolder) } - plots = rbind(plots, getPlotReportDF("cumulative_partitions", "Cumulative distribution over genomic partitions")) + plots = rbind(plots, getPlotReportDF("cumulative_partitions", "Cumulative distribution over genomic partitions", digest, outfolder)) message("Successfully calculated and plot cumulative distribution over genomic partitions.") } }, @@ -311,7 +275,7 @@ doItAall <- function(query, fileId, genome, cellMatrix) { } ) } - + # QThis plot if (exists("bedmeta")){ if ("mean_region_width" %in% names(bedmeta)){ @@ -327,46 +291,46 @@ doItAall <- function(query, fileId, genome, cellMatrix) { tryCatch( expr = { widths = calcWidth(query) - plotBoth("widths_histogram", plotQTHist(widths)) + plotBoth("widths_histogram", plotQTHist(widths), digest, outfolder) if (exists("bedmeta")){ mean_region_width <- list(mean_region_width = signif(mean(widths), digits = 4)) bedmeta = append(bedmeta, mean_region_width) } - plots = rbind(plots, getPlotReportDF("widths_histogram", "Quantile-trimmed histogram of widths")) + plots = rbind(plots, getPlotReportDF("widths_histogram", "Quantile-trimmed histogram of widths", digest, outfolder)) message("Successfully calculated and plot quantile-trimmed histogram of widths.") }, error = function(e){ message('Caught an error in creating: Quantile-trimmed histogram of widths plot!') print(e, widths) } - ) + ) } - + # Neighbor regions distance plots if (!exists("bedmeta") ){ tryCatch( expr = { - plotBoth("neighbor_distances", plotNeighborDist(calcNeighborDist(query))) - plots = rbind(plots, getPlotReportDF("neighbor_distances", "Distance between neighbor regions")) + plotBoth("neighbor_distances", plotNeighborDist(calcNeighborDist(query)), digest, outfolder) + plots = rbind(plots, getPlotReportDF("neighbor_distances", "Distance between neighbor regions", digest, outfolder)) message("Successfully calculated and plot distance between neighbor regions.") }, error = function(e){ message('Caught an error in creating: Distance between neighbor regions plot!') print(e) } - ) + ) } - + ## This part is heavy and if needed can be skipped # Tissue specificity plot if open signal matrix is provided if (!exists("bedmeta") ){ - if (cellMatrix == "None") { + if (openSignalMatrix == "None") { message("open signal matrix not provided. Skipping tissue specificity plot ... ") } else { tryCatch( expr = { - plotBoth("open_chromatin", plotSummarySignal(calcSummarySignal(query, data.table::fread(cellMatrix)))) - plots = rbind(plots, getPlotReportDF("open_chromatin", "Cell specific enrichment for open chromatin")) + plotBoth("open_chromatin", plotSummarySignal(calcSummarySignal(query, data.table::fread(openSignalMatrix))), digest, outfolder) + plots = rbind(plots, getPlotReportDF("open_chromatin", "Cell specific enrichment for open chromatin", digest, outfolder)) message("Successfully calculated and plot cell specific enrichment for open chromatin.") }, error = function(e){ @@ -376,18 +340,18 @@ doItAall <- function(query, fileId, genome, cellMatrix) { ) } } - - + + # Note: names of the list elements MUST match what's defined in: https://github.com/databio/bbconf/blob/master/bbconf/schemas/bedfiles_schema.yaml if (exists("bedmeta")){ write(jsonlite::toJSON(bedmeta, pretty=TRUE), meta_path) write(jsonlite::toJSON(plots, pretty=TRUE, auto_unbox = TRUE), plot_path) } else { bedmeta = list( - name=fileId, + name=digest, number_of_regions=length(query), mean_region_width=ifelse(exists('widths'), signif(mean(widths), digits = 4), NA), - md5sum=opt$digest + md5sum=digest ) if (exists('gcvec') && !isEmpty(gcvec)){ gc_content <- list(gc_content = signif(mean(gcvec), digits = 4)) @@ -402,46 +366,48 @@ doItAall <- function(query, fileId, genome, cellMatrix) { } else { write(jsonlite::toJSON(c(bedmeta), pretty=TRUE), meta_path) } - + if (exists('plots')){ write(jsonlite::toJSON(plots, pretty=TRUE, auto_unbox = TRUE), plot_path) } } } -# define values and output folder for doitall() -fileId = opt$fileId -bedPath = opt$bedfilePath -outfolder = opt$outputFolder -genome = opt$genome -cellMatrix = opt$openSignalMatrix -gtffile = opt$ensdb - - -# build BSgenome package ID to check whether it's installed -if ( startsWith(genome, "T2T")){ - BSg = "BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0" -} else { - if (startsWith(genome, "hg") | startsWith(genome, "grch")) { - orgName = "Hsapiens" - } else if (startsWith(genome, "mm") | startsWith(genome, "grcm")){ - orgName = "Mmusculus" - } else if (startsWith(genome, "dm")){ - orgName = "Dmelanogaster" - } else if (startsWith(genome, "ce")){ - orgName = "Celegans" - } else if (startsWith(genome, "danRer")){ - orgName = "Drerio" - } else if (startsWith(genome, "TAIR")){ - orgName = "Athaliana" - } else { - orgName = "Undefined" - } - BSg = paste0("BSgenome.", orgName , ".UCSC.", genome) -} +runBEDStats = function (bedPath, digest, outfolder, genome, openSignalMatrix, gtffile) { + # define values and output folder for doitall() + message("R: Running regionstat for: ", bedPath) + message("R: Values: ") + message("R: digest: ", digest) + message("R: outfolder: ", outfolder) + message("R: genome: ", genome) + message("R: openSignalMatrix: ", openSignalMatrix) + message("R: gtffile: ", gtffile) -BSgm = paste0(BSg, ".masked") -# read bed file and run doitall() -query = LOLA::readBed(bedPath) -doItAall(query, fileId, genome, cellMatrix) + # build BSgenome package ID to check whether it's installed + if ( startsWith(genome, "T2T")){ + BSg = "BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0" + } else { + if (startsWith(genome, "hg") | startsWith(genome, "grch")) { + orgName = "Hsapiens" + } else if (startsWith(genome, "mm") | startsWith(genome, "grcm")){ + orgName = "Mmusculus" + } else if (startsWith(genome, "dm")){ + orgName = "Dmelanogaster" + } else if (startsWith(genome, "ce")){ + orgName = "Celegans" + } else if (startsWith(genome, "danRer")){ + orgName = "Drerio" + } else if (startsWith(genome, "TAIR")){ + orgName = "Athaliana" + } else { + orgName = "Undefined" + } + BSg = paste0("BSgenome.", orgName , ".UCSC.", genome) + } + + BSgm = paste0(BSg, ".masked") + + query = LOLA::readBed(bedPath) + doItAll(query, digest, genome, openSignalMatrix, outfolder, BSg, BSgm, gtffile) +} diff --git a/bedboss/bedstat/tools/regionstat_cli.R b/bedboss/bedstat/tools/regionstat_cli.R new file mode 100644 index 0000000..481578d --- /dev/null +++ b/bedboss/bedstat/tools/regionstat_cli.R @@ -0,0 +1,40 @@ +args <- commandArgs(trailingOnly = FALSE) +script_path <- sub("--file=", "", args[grep("--file=", args)]) # Extract script path +script_dir <- dirname(normalizePath(script_path)) +source(file.path(script_dir, "regionstat.R")) + +library(optparse) +option_list = list( + make_option(c("--bedfilePath"), type="character", default=NULL, + help="full path to a BED file to process", metavar="character"), + make_option(c("--openSignalMatrix"), type="character", + help="path to the open signal matrix required for the tissue specificity plot", metavar="character"), + make_option(c("--digest"), type="character", default=NULL, + help="digest of the BED file", metavar="character"), + make_option(c("--outputFolder"), type="character", default="output", + help="base output folder for results", metavar="character"), + make_option(c("--genome"), type="character", default="hg38", + help="genome reference to calculate against", metavar="character"), + make_option(c("--ensdb"), type="character", + help="path to the Ensembl annotation gtf file", metavar="character") +) + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +if (is.null(opt$bedfilePath)) { + print_help(opt_parser) + stop("Bed file input missing.") +} + +if (is.null(opt$digest)) { + print_help(opt_parser) + stop("digest input missing.") +} + +if (is.null(opt$digest)) { + print_help(opt_parser) + stop("digest input missing.") +} + +runBEDStats(opt$bedfilePath, opt$digest, opt$outputFolder, opt$genome, opt$openSignalMatrix, opt$ensdb) diff --git a/scripts/bbuploader/main.py b/scripts/bbuploader/main.py index f698c54..477ecac 100644 --- a/scripts/bbuploader/main.py +++ b/scripts/bbuploader/main.py @@ -54,7 +54,7 @@ def another_test(): run_failed=True, run_skipped=True, reinit_skipper=True, - lite=True, + lite=False, overwrite=True, overwrite_bedset=True, )