From 20377d291d0a58e795afad7be7c82f9486dcdd64 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Wed, 22 Jan 2025 16:14:34 +0100 Subject: [PATCH 01/11] add npz output, write count output only if needed --- Cargo.toml | 3 +-- deeptools4.0.0_changes.md | 3 +++ src/multibamsummary.rs | 52 +++++++++++++++++++++++++++------------ 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2d051aecdc..d319cb2898 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,5 +18,4 @@ tokio = "*" flate2 = "*" tempfile = "*" ndarray = "0.16.1" -npyz = "*" -zip = "*" \ No newline at end of file +ndarray-npy = "*" \ No newline at end of file diff --git a/deeptools4.0.0_changes.md b/deeptools4.0.0_changes.md index 37ea18f016..cd2e3b2adc 100644 --- a/deeptools4.0.0_changes.md +++ b/deeptools4.0.0_changes.md @@ -33,6 +33,9 @@ - unmapped reads to unfiltered_out +## multiBamSummary + - npz output has labels encoded as u8s, no longer strings. + # Todo - AlignmentSieve: Shift, Bed, Optimization. diff --git a/src/multibamsummary.rs b/src/multibamsummary.rs index 6458656d46..119c11e406 100644 --- a/src/multibamsummary.rs +++ b/src/multibamsummary.rs @@ -6,9 +6,10 @@ use itertools::{multizip, multiunzip, izip}; use std::io::prelude::*; use std::io::BufReader; use std::fs::File; -use ndarray::Array2; -use ndarray::Array1; +use ndarray::{Array1, Array2, stack}; use std::io; +use ndarray_npy::NpzWriter; +use std::borrow::Cow; use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters}; use crate::filehandler::{bam_ispaired}; use crate::calc::{median, calc_ratio}; @@ -53,6 +54,11 @@ pub fn r_mbams( bamlabels = labels.extract(py).expect("Failed to retrieve labels."); }); + let max_len = bamlabels.iter().map(|s| s.len()).max().unwrap_or(0); + let bamlabels_arr: Array2<u8> = Array2::from_shape_fn((bamlabels.len(), max_len), |(i, j)| { + bamlabels[i].as_bytes().get(j).copied().unwrap_or(0) + }); + // Get paired-end information let ispe = bamfiles.iter() .map(|x| bam_ispaired(x)) @@ -112,8 +118,13 @@ // create output file to write to // create output file to write lines into - let mut file = File::create(out_raw_counts).unwrap(); - + let mut cfile = None; + if out_raw_counts != "None" { + cfile = Some(File::create(out_raw_counts).unwrap()); + } + + let mut matvec: Vec<Array1<f32>> = Vec::new(); + let its: Vec<_> = covcalcs.iter().map(|x| x.iter()).collect(); let zips = TempZip { iterators: its }; for c in zips { @@ -128,21 +139,30 @@ .map(|x| x.split('\t').collect()) .map(|x: Vec<&str> | ( x[0].to_string(), x[1].parse::<u32>().unwrap(), x[2].parse::<u32>().unwrap(), x[3].parse::<f32>().unwrap() ) ) .collect(); - // collect all fields 3 together - let mut line: Vec<Countline> = Vec::new(); - line.push(Countline::Text(lines[0].0.clone())); - line.push(Countline::Int(lines[0].1)); - line.push(Countline::Int(lines[0].2)); - for _line in lines { - line.push(Countline::Float(_line.3)); + let arrline = Array1::from(lines.iter().map(|x| x.3).collect::<Vec<f32>>()); + matvec.push(arrline); + // Write to countsfile if cfile is not None + if let Some(ref mut cfile) = cfile { + let mut line: Vec<Countline> = Vec::new(); + line.push(Countline::Text(lines[0].0.clone())); + line.push(Countline::Int(lines[0].1)); + line.push(Countline::Int(lines[0].2)); + for _line in lines { + line.push(Countline::Float(_line.3)); + } + let fline = line.iter().map(|x| x.to_string()).collect::<Vec<String>>().join("\t"); + writeln!(cfile, "{}", fline).unwrap(); } - println!("{:?}", line); - let fline = line.iter().map(|x| x.to_string()).collect::<Vec<String>>().join("\t"); - writeln!(file, 
"{}", fline).unwrap(); } } - - + let array2: Array2 = Array2::from_shape_vec( + (matvec.len(), matvec[0].len()), + matvec.into_iter().flat_map(|x| x.into_iter()).collect() + ).unwrap(); + let mut npz = NpzWriter::new(File::create(ofile).unwrap()); + npz.add_array("matrix", &array2).unwrap(); + npz.add_array("labels", &bamlabels_arr).unwrap(); + npz.finish().unwrap(); Ok(()) } From 3193ecd22f9545b7884e5169843c0a26918234fc Mon Sep 17 00:00:00 2001 From: WardDeb Date: Fri, 24 Jan 2025 19:54:41 +0100 Subject: [PATCH 02/11] bins/regions implementation for multibamsummary, npz output. Implement RegionBlocks to balance number of temp files versus bps to process. --- pydeeptools/deeptools/multiBamSummary2.py | 2 +- src/alignmentsieve.rs | 5 +- src/bamcompare.rs | 6 +- src/bamcoverage.rs | 5 +- src/calc.rs | 18 + src/computematrix.rs | 1001 +------------- src/covcalc.rs | 1451 ++++++++++++++++++--- src/filehandler.rs | 33 +- src/multibamsummary.rs | 208 ++- 9 files changed, 1454 insertions(+), 1275 deletions(-) diff --git a/pydeeptools/deeptools/multiBamSummary2.py b/pydeeptools/deeptools/multiBamSummary2.py index a42eeff8fb..1085d1dc9b 100644 --- a/pydeeptools/deeptools/multiBamSummary2.py +++ b/pydeeptools/deeptools/multiBamSummary2.py @@ -208,7 +208,7 @@ def process_args(args=None): args.labels = [os.path.basename(x) for x in args.bamfiles] if not args.BED: - args.BED = "None" + args.BED = [] if not args.region: args.region = [] if not args.blackListFileName: diff --git a/src/alignmentsieve.rs b/src/alignmentsieve.rs index 3f37e54970..02b46d5d5d 100644 --- a/src/alignmentsieve.rs +++ b/src/alignmentsieve.rs @@ -8,7 +8,7 @@ use rust_htslib::bam::{self, record, Header, IndexedReader, Read, Reader, Writer use tempfile::{Builder, TempPath, NamedTempFile}; use std::fs::File; use std::io::Write; -use crate::covcalc::{parse_regions, Alignmentfilters}; +use crate::covcalc::{parse_regions, Alignmentfilters, Region}; #[pyfunction] pub fn r_alignmentsieve( @@ -113,7 +113,8 @@ pub fn r_alignmentsieve( } -fn sieve_bamregion(ibam: &str, region: &(String, u32, u32), alfilters: &Alignmentfilters, filter_rna_strand: &str, shift: &Vec, write_filters: bool, nproc: usize, verbose: bool) -> (Vec>, Vec>, u64, u64) { +fn sieve_bamregion(ibam: &str, regstruct: &Region, alfilters: &Alignmentfilters, filter_rna_strand: &str, shift: &Vec, write_filters: bool, nproc: usize, verbose: bool) -> (Vec>, Vec>, u64, u64) { + let region = (regstruct.chrom.clone(), regstruct.get_startu(), regstruct.get_endu()); let mut total_reads: u64 = 0; let mut filtered_reads: u64 = 0; let mut bam = IndexedReader::from_path(ibam).unwrap(); diff --git a/src/bamcompare.rs b/src/bamcompare.rs index f327392262..3d80cf905d 100644 --- a/src/bamcompare.rs +++ b/src/bamcompare.rs @@ -8,7 +8,7 @@ use std::fs::File; use itertools::Itertools; use bigtools::{Value}; use crate::filehandler::{bam_ispaired, write_covfile}; -use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters}; +use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, region_divider}; use crate::normalization::scale_factor_bamcompare; use crate::calc::{median, calc_ratio}; use tempfile::{TempPath}; @@ -61,6 +61,8 @@ pub fn r_bamcompare( // Parse regions & calculate coverage. Note that let (regions, chromsizes) = parse_regions(®ions, bamifile1); + let regionblocks = region_divider(®ions); + let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap(); // Set up the bam files in a Vec. 
@@ -69,7 +71,7 @@ pub fn r_bamcompare( let covcalcs: Vec = pool.install(|| { bamfiles.par_iter() .map(|(bamfile, ispe)| { - let (bg, mapped, unmapped, readlen, fraglen) = regions.par_iter() + let (bg, mapped, unmapped, readlen, fraglen) = regionblocks.par_iter() .map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false)) .reduce( || (vec![], 0, 0, vec![], vec![]), diff --git a/src/bamcoverage.rs b/src/bamcoverage.rs index 256cd34b18..9b3a0b6496 100644 --- a/src/bamcoverage.rs +++ b/src/bamcoverage.rs @@ -6,7 +6,7 @@ use std::io::prelude::*; use std::io::BufReader; use std::fs::File; use bigtools::Value; -use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters}; +use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, region_divider}; use crate::filehandler::{bam_ispaired, write_covfile}; use crate::normalization::scale_factor; use crate::calc::median; @@ -71,9 +71,10 @@ pub fn r_bamcoverage( // Parse regions & calculate coverage let (regions, chromsizes) = parse_regions(®ions, bamifile); + let regionblocks = region_divider(®ions); let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap(); let (bg, mapped, _unmapped, readlen, fraglen) = pool.install(|| { - regions.par_iter() + regionblocks.par_iter() .map(|i| bam_pileup(bamifile, &i, &binsize, &ispe, &ignorechr, &filters, true)) .reduce( || (vec![], 0, 0, vec![], vec![]), diff --git a/src/calc.rs b/src/calc.rs index 15b4c83e43..75239ad1e7 100644 --- a/src/calc.rs +++ b/src/calc.rs @@ -1,3 +1,5 @@ +use ndarray::{Array1, Array2, Axis}; + pub fn median(mut nvec: Vec) -> f32 { if nvec.is_empty() { 0.0 @@ -156,3 +158,19 @@ pub fn calc_ratio( } } } + +pub fn deseq_scalefactors(array2: &Array2) -> Array1 { + let loggeomeans = array2.mapv(|v| v.ln()).sum_axis(Axis(1)) / array2.shape()[1] as f32; + let masked_array = array2.mapv(|x| if x <= 0.0 { f32::NAN } else { x }); + let masked_loggeomeans = loggeomeans.mapv(|x| if x.is_infinite() { f32::NAN } else { x }); + let adjusted_loga = masked_array.mapv(|x| x.ln()).t().to_owned() - &masked_loggeomeans; + let medians: Array1 = adjusted_loga.t().axis_iter(Axis(1)) + .map(|x| { + let vec: Vec<&f32> = x.iter() + .filter(|&&x| !x.is_nan()) + .collect(); + median_float(vec) + }) + .collect(); + medians.mapv(|x| 1.0 / x.exp()) +} \ No newline at end of file diff --git a/src/computematrix.rs b/src/computematrix.rs index 5ff64f7c19..ea25902118 100644 --- a/src/computematrix.rs +++ b/src/computematrix.rs @@ -5,11 +5,9 @@ use rayon::prelude::*; use rayon::ThreadPoolBuilder; use std::collections::HashMap; use std::path::Path; -use std::fmt; use itertools::Itertools; -use ndarray::Array1; use crate::calc::{mean_float, median_float, max_float, min_float, sum_float}; - +use crate::covcalc::{Region, Gtfparse, Scalingregions, Bin}; #[pyfunction] pub fn r_computematrix( @@ -202,1003 +200,6 @@ fn slop_region( region.get_anchor_bins(scale_regions, chromend) } -#[derive(Debug)] -pub struct Scalingregions { - pub upstream: u32, - pub downstream: u32, - pub unscaled5prime: u32, - pub unscaled3prime: u32, - pub regionbodylength: u32, - pub binsize: u32, - pub cols_expected: usize, - pub bpsum: u32, - pub missingdata_as_zero: bool, - pub scale: f32, - pub nan_after_end: bool, - pub skipzero: bool, - pub minthresh: f32, - pub maxthresh: f32, - pub referencepoint: String, - pub mode: String, - pub bwfiles: usize, - pub avgtype: String, - pub verbose: bool, - pub proc_number: usize, - pub regionlabels: Vec, - pub bwlabels: Vec -} - -#[derive(Clone)] -pub struct Gtfparse { - pub 
metagene: bool, - pub txnid: String, - pub exonid: String, - pub txniddesignator: String, -} - -#[derive(Clone)] -pub enum Revalue { - U(u32), - V(Vec), -} - -impl Revalue { - pub fn rewrites(&self) -> String { - match self { - Revalue::U(v) => format!("{}", v), - Revalue::V(vs) => vs.iter() - .map(|v| v.to_string()) - .collect::>() - .join(","), - } - } -} - -impl fmt::Debug for Revalue { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Revalue::U(value) => write!(f, "U({})", value), - Revalue::V(values) => write!(f, "V({:?})", values), - } - } -} - -impl fmt::Display for Revalue { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Revalue::U(value) => write!(f, "U({})", value), - Revalue::V(values) => write!(f, "V({})", values.iter().map(|v| v.to_string()).collect::>().join(", ")), - } - } -} - -#[derive(Clone, Debug)] -pub enum Bin { - Conbin(u32, u32), - Catbin(Vec<(u32, u32)>), -} - -impl Bin { - pub fn get_start(&self) -> u32 { - match self { - Bin::Conbin(start, _) => *start, - Bin::Catbin(starts) => starts.first().unwrap().0, - } - } - pub fn get_end(&self) -> u32 { - match self { - Bin::Conbin(_, end) => *end, - Bin::Catbin(ends) => ends.last().unwrap().1, - } - } -} - - -#[derive(Clone, Debug)] -pub struct Region { - pub chrom: String, - pub start: Revalue, - pub end: Revalue, - pub score: String, - pub strand: String, - pub name: String, - pub regionlength: u32 -} - -impl Region { - pub fn assert_end(&self, chromend: u32) { - match &self.end { - Revalue::U(end) => { - assert!( - *end <= chromend, - "Region end goes beyond chromosome boundary. Fix {}. {} {} {} (chr end = {})", self.name, self.chrom, self.start, self.end, chromend - ); - }, - Revalue::V(ends) => { - for end in ends.iter() { - assert!( - *end <= chromend, - "Region end goes beyond chromosome boundary. Fix {}. {} {} {} (chr end = {})", self.name, self.chrom, self.start, end, chromend - ); - } - } - } - } - - pub fn get_anchorpoint(&self, referencepoint: &str) -> u32 { - // reference-point mode. - // What is exactly returned depends on a couple of parameters - // what is the referencepoint : TSS, center, TES - // what is the strand: +, -, . (note . is assumed to be +) - // depending on if we have exon blocks (start / end are Revalue V == Vectors) or not (start / end are Revalue U == u32's) - match referencepoint { - "TSS" => { - match self.strand.as_str() { - "+" | "." => match &self.start {Revalue::U(start) => *start, Revalue::V(starts) => starts[0]}, - "-" => match &self.end {Revalue::U(end) => *end, Revalue::V(ends) => *ends.last().unwrap()}, - _ => panic!("Strand should either be + or - or . {:?} is not supported.", self.strand), - } - }, - "TES" => { - match self.strand.as_str() { - "+" | "." => match &self.end {Revalue::U(end) => *end, Revalue::V(ends) => *ends.last().unwrap()}, - "-" => match &self.start {Revalue::U(start) => *start, Revalue::V(starts) => starts[0]}, - _ => panic!("Strand should either be + or - or . {:?} is not supported.", self.strand), - } - }, - "center" => { - // Here + or - doesn't matter. It is important though if we have 'metagenes' or not. 
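// [Aside] A worked example of the exon-aware "center" computed below, with assumed
// exons (100,200) and (300,350): exonlength = 100 + 50 = 150, so middle = 75; the
// first exon already satisfies cumsum (100) >= 75, and the anchor returned is
// 100 + (75 - (100 - 100)) = 175.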
- match (&self.start, &self.end) { - (Revalue::U(start), Revalue::U(end)) => { - (*start + *end) / 2 - }, - (Revalue::V(starts), Revalue::V(ends)) => { - let exonlength: u32 = starts.iter().zip(ends.iter()).map(|(s, e)| e - s).sum(); - let middle = exonlength / 2; - let mut cumsum: u32 = 0; - for (s, e) in starts.iter().zip(ends.iter()) { - cumsum += e - s; - if cumsum >= middle { - return s + (middle - (cumsum - (e - s))); - } - } - panic!( - "Middle of region not found. Fix {}. {}:{}-{}", - self.name, self.chrom, self.start, self.end - ) - }, - _ => panic!( - "Start and End are not either both u32, or Vecs. This means your regions file is ill-defined. Fix {}. {}:{}-{}", - self.name, self.chrom, self.start, self.end - ), - } - }, - _ => panic!( - "Reference should either be TSS, TES or center. {:?} is not supported.", - referencepoint - ), - } - } - - pub fn get_anchor_bins(&self, scale_regions: &Scalingregions, chromend: u32) -> Vec { - // Given an anchorpoint, return a vector, start, end , middle - // The order of the vector is always 5' -> 3', meaning 'increasing' for +/. regions, and 'decreasing' for - regions. - // At this stage, two situations are possible: - // - self.start / self.end are Revalue::U, meaning we are in 'non metagene' mode. - // - self.start / self.end are Revalue::V, meaning we are in 'metagene' mode, and the bins returned are exon-aware. - // We need a notion of bins that don't make sense (i.e. beyond chromosome boundaries). These are encoded as (0,0) - - let mut bins: Vec = Vec::new(); - let mut bodybins: Vec = Vec::new(); - - // Define anchorpoints - let anchorstart; - let anchorstop; - match scale_regions.mode.as_str() { - "reference-point" => { - anchorstart = self.get_anchorpoint(&scale_regions.referencepoint); - anchorstop = anchorstart; - }, - "scale-regions" => { - match (&self.start, &self.end) { - (Revalue::U(start), Revalue::U(end)) => { - anchorstart = *start; - anchorstop = *end; - }, - (Revalue::V(start), Revalue::V(end)) => { - anchorstart = *start.first().unwrap(); - anchorstop = *end.last().unwrap(); - }, - _ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined. Fix {}.",self.name), - } - }, - _ => panic!("Mode should either be reference-point or scale-regions. {} is not supported.", scale_regions.mode), - } - if scale_regions.mode != "reference-point" { - // scale-regions mode. Assert - assert!(scale_regions.regionbodylength != 0, "scale-regions mode, but regionbodylength is 0."); - if self.regionlength < (scale_regions.unscaled5prime + scale_regions.unscaled3prime) || - self.regionlength - (scale_regions.unscaled5prime + scale_regions.unscaled3prime) < scale_regions.binsize { - println!("Warning ! Region {} is shorter than the binsize (potentially after unscaled regions taken into account. Whole region encoded as 0 or NA", self.name); - let nbin = scale_regions.cols_expected / scale_regions.bwfiles; - for _ in 0..nbin { - bins.push(Bin::Conbin(0,0)); - } - return bins; - } else { - bodybins.extend(self.scale_regionbody(scale_regions, chromend)); - } - } - - // Get flanking regions. - // Note that we still need to deal with exon - non-exon as reference-point mode could require metagene walks. - match self.strand.as_str() { - "+" | "." 
=> { - match (&self.start, &self.end) { - (Revalue::U(start), Revalue::U(end)) => { - let mut leftbins: Vec = Vec::new(); - let mut rightbins: Vec = Vec::new(); - - let mut absstart: i64 = anchorstart as i64 - scale_regions.upstream as i64; - let absstop: i64 = anchorstop as i64 + scale_regions.downstream as i64; - - for binix in (absstart..anchorstart as i64).step_by(scale_regions.binsize as usize) { - if binix < 0 || binix as u32 >= chromend || (binix + scale_regions.binsize as i64) as u32 >= chromend { - leftbins.push(Bin::Conbin(0,0)); - } else if scale_regions.nan_after_end && binix as u32 <= *start { - leftbins.push(Bin::Conbin(0,0)); - } else { - leftbins.push(Bin::Conbin(binix as u32, (binix as u32) + scale_regions.binsize)); - } - } - - for binix in (anchorstop as i64..absstop).step_by(scale_regions.binsize as usize) { - if binix < 0 || binix as u32 >= chromend || (binix + scale_regions.binsize as i64) as u32 >= chromend { - rightbins.push(Bin::Conbin(0,0)); - } else if scale_regions.nan_after_end && binix as u32 >= *end { - rightbins.push(Bin::Conbin(0,0)); - } else { - rightbins.push(Bin::Conbin(binix as u32, (binix as u32) + scale_regions.binsize)); - } - } - - for bin in leftbins.into_iter() { - bins.push(bin); - } - // If we have bodybins, they should be squeezed in here. - for bin in bodybins.into_iter() { - bins.push(bin); - } - // Reset bodybins, as they are consumed. - bodybins = Vec::new(); - for bin in rightbins.into_iter() { - bins.push(bin); - } - } - (Revalue::V(start), Revalue::V(end)) => { - let exons: Vec<(u32, u32)> = start.iter().zip(end.iter()) - .map(|(&s, &e)| (s, e)) - .collect(); - // Right side. - let mut rightbins: Vec = Vec::new(); - let mut lastanchor: u32 = anchorstop; - let mut walked_bps: u32 = 0; - while walked_bps < scale_regions.downstream { - if lastanchor >= chromend { - rightbins.push(Bin::Conbin(0,0)); - walked_bps += scale_regions.binsize; - } else { - let (bin, retanch) = refpoint_exonwalker( - &exons, - lastanchor, - scale_regions.binsize, - chromend, - scale_regions.nan_after_end, - true - ); - rightbins.push(bin); - walked_bps += scale_regions.binsize; - lastanchor = retanch; - } - } - // Left side. - let mut leftbins: Vec = Vec::new(); - let mut lastanchor: u32 = anchorstart; - let mut walked_bps: u32 = 0; - while walked_bps < scale_regions.upstream { - if lastanchor == 0 { - leftbins.push(Bin::Conbin(0,0)); - walked_bps += scale_regions.binsize; - } else { - let (bin, retanch) = refpoint_exonwalker( - &exons, - lastanchor, - scale_regions.binsize, - chromend, - scale_regions.nan_after_end, - false - ); - leftbins.push(bin); - walked_bps += scale_regions.binsize; - lastanchor = retanch; - } - } - // Now we need to reverse the leftbins, as we walked backwards. - leftbins.reverse(); - for bin in leftbins.into_iter() { - bins.push(bin); - } - // If we have bodybins, they should be squeezed in here. - for bin in bodybins.into_iter() { - bins.push(bin); - } - // Reset bodybins, as they are consumed. - bodybins = Vec::new(); - for bin in rightbins.into_iter() { - bins.push(bin); - } - } - _ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined. Fix {}. 
{}:{}-{}", - self.name, self.chrom, self.start, self.end), - } - }, - "-" => { - match (&self.start, &self.end) { - (Revalue::U(start), Revalue::U(end)) => { - let mut leftbins: Vec = Vec::new(); - let mut rightbins: Vec = Vec::new(); - - let mut absstart: i64 = anchorstop as i64 + scale_regions.upstream as i64; - let absstop: i64 = anchorstart as i64 - scale_regions.downstream as i64; - - let steps: Vec<_> = (anchorstop as i64..absstart) - .step_by(scale_regions.binsize as usize) - .collect(); - for binix in steps.into_iter().rev() { - if binix as u32 > chromend || (binix + scale_regions.binsize as i64) as u32 > chromend { - rightbins.push(Bin::Conbin(0,0)); - } else if scale_regions.nan_after_end && binix as u32 >= *end { - leftbins.push(Bin::Conbin(0,0)); - } else { - leftbins.push(Bin::Conbin(binix as u32, (binix as u32) + scale_regions.binsize)); - } - } - - let steps: Vec<_> = (absstop..anchorstart as i64) - .step_by(scale_regions.binsize as usize) - .collect(); - for binix in steps.into_iter().rev() { - if binix < 0 { - leftbins.push(Bin::Conbin(0,0)); - } else if scale_regions.nan_after_end && binix as u32 + scale_regions.binsize <= *start { - rightbins.push(Bin::Conbin(0,0)); - } else { - rightbins.push(Bin::Conbin(binix as u32, (binix as u32) + scale_regions.binsize)); - } - } - - for bin in rightbins.into_iter() { - bins.push(bin); - } - // If we have bodybins, they should be squeezed in here. - bodybins.reverse(); - for bin in bodybins.into_iter() { - bins.push(bin); - } - // Reset bodybins, as they are consumed. - bodybins = Vec::new(); - for bin in leftbins.into_iter() { - bins.push(bin); - } - } - (Revalue::V(start), Revalue::V(end)) => { - let exons: Vec<(u32, u32)> = start.iter().zip(end.iter()) - .map(|(&s, &e)| (s, e)) - .collect(); - // Right side. - let mut rightbins: Vec = Vec::new(); - let mut lastanchor: u32 = anchorstop; - let mut walked_bps: u32 = 0; - while walked_bps < scale_regions.upstream { - if lastanchor >= chromend { - rightbins.push(Bin::Conbin(0,0)); - walked_bps += scale_regions.binsize; - } else { - let (bin, retanch) = refpoint_exonwalker( - &exons, - lastanchor, - scale_regions.binsize, - chromend, - scale_regions.nan_after_end, - true - ); - rightbins.push(bin); - walked_bps += scale_regions.binsize; - lastanchor = retanch; - } - } - // Left side. - let mut leftbins: Vec = Vec::new(); - let mut lastanchor: u32 = anchorstart; - let mut walked_bps: u32 = 0; - while walked_bps < scale_regions.downstream { - if lastanchor == 0 { - leftbins.push(Bin::Conbin(0,0)); - walked_bps += scale_regions.binsize; - } else { - let (bin, retanch) = refpoint_exonwalker( - &exons, - lastanchor, - scale_regions.binsize, - chromend, - scale_regions.nan_after_end, - false - ); - leftbins.push(bin); - walked_bps += scale_regions.binsize; - lastanchor = retanch; - } - } - // Note that now we need to go the exact opposite way as for the + strand as the 'highest position' is the 'starting point'. - rightbins.reverse(); - for bin in rightbins.into_iter() { - bins.push(bin); - } - bodybins.reverse(); - for bin in bodybins.into_iter() { - bins.push(bin); - } - bodybins = Vec::new(); - for bin in leftbins.into_iter() { - bins.push(bin); - } - } - _ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined. Fix {}. {}:{}-{}", - self.name, self.chrom, self.start, self.end), - } - }, - _ => panic!("Strand should either be + or - or . 
{:?} is not supported.", self.strand), - }; - assert_eq!( - bins.len(), - scale_regions.cols_expected / scale_regions.bwfiles, - "Number of bins does not match expected number of columns: {} != {}", - bins.len(), - scale_regions.cols_expected / scale_regions.bwfiles - ); - bins - } - - pub fn scale_regionbody(&self, scale_regions: &Scalingregions, chromend: u32) -> Vec { - // A vector of bins needs to be constructed for regionbody. - // Since this is scaling mode, 'linspace' functionality is reproduced. - match self.strand.as_str() { - "+" | "." => { - match (&self.start, &self.end) { - (Revalue::U(start), Revalue::U(end)) => { - // No exons, forward strand. divide start - end as such: - // |---un5prime---|---bodylength---|---un3prime---| - let mut un5bins: Vec = Vec::new(); - let mut un3bins: Vec = Vec::new(); - let mut innerbins: Vec = Vec::new(); - if scale_regions.unscaled5prime > 0 { - un5bins.extend((0..scale_regions.unscaled5prime) - .step_by(scale_regions.binsize as usize) - .map(|i| Bin::Conbin(*start + i, *start + i + scale_regions.binsize)) - .collect::>()); - } - - if scale_regions.unscaled3prime > 0 { - un3bins.extend( (0..scale_regions.unscaled3prime) - .step_by(scale_regions.binsize as usize) - .rev() - .map(|i| Bin::Conbin(*end - i - scale_regions.binsize, *end - i)) - .collect::>() ); - } - let bodystart = *start + scale_regions.unscaled5prime; - let bodyend = *end - scale_regions.unscaled3prime; - - // Get the bins over the body length. These need to be scaled, so similar to deeptools < 4, linspace is used. - let neededbins = (scale_regions.regionbodylength / scale_regions.binsize) as usize; - // There's multiple options here: - // transcriptlength >= regionbodylength -> linspace - // regionbodylength / binsize > transcriptlength <= regionbodylength -> 1 >= binsize > binsize. - // transcriptlength <= regionbodylength / binsize -> index repetitions with binsize of one. 
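// [Aside] The clamp below, illustrated with assumed numbers (regionbodylength 1000,
// binsize 10, so neededbins = 100): a 5000 bp body gives 5000/100 = 50, clamped down
// to the requested binsize of 10 (plain linspace downsampling); a 400 bp body gives
// 4 bp bins; a 50 bp body gives 0, clamped up to 1 bp bins whose indices linspace
// then repeats.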
- let scaledbinsize = std::cmp::min(std::cmp::max((bodyend - bodystart) / neededbins as u32, 1), scale_regions.binsize); - - innerbins.extend( Array1::linspace(bodystart as f32, (bodyend - scaledbinsize) as f32, neededbins) - .mapv(|x| x as u32) - .map(|x| Bin::Conbin(*x, *x + scaledbinsize)) - .into_iter() - .collect::>() ); - - // Combine the vectors and return - let mut combined_bins = Vec::new(); - if scale_regions.unscaled5prime > 0 { - combined_bins.extend(un5bins.into_iter()); - } - combined_bins.extend(innerbins.into_iter()); - if scale_regions.unscaled3prime > 0 { - combined_bins.extend(un3bins.into_iter()); - } - return combined_bins; - }, - (Revalue::V(start), Revalue::V(end)) => { - let exons: Vec<(u32, u32)> = start.iter().zip(end.iter()) - .map(|(&s, &e)| (s, e)) - .collect(); - let mut un5bins: Vec = Vec::new(); - let mut un3bins: Vec = Vec::new(); - if scale_regions.unscaled5prime > 0 { - let mut walked_bps: u32 = 0; - let mut lastanchor: u32 = start[0]; - - while walked_bps < scale_regions.unscaled5prime { - let (bin, retanch) = refpoint_exonwalker( - &exons, - lastanchor, - scale_regions.binsize, - chromend, - scale_regions.nan_after_end, - true - ); - un5bins.push(bin); - walked_bps += scale_regions.binsize; - lastanchor = retanch; - } - } - if scale_regions.unscaled3prime > 0 { - let mut walked_bps: u32 = 0; - let mut lastanchor: u32 = *end.last().unwrap(); - - while walked_bps < scale_regions.unscaled3prime { - let (bin, retanch) = refpoint_exonwalker( - &exons, - lastanchor, - scale_regions.binsize, - chromend, - scale_regions.nan_after_end, - false - ); - un3bins.push(bin); - walked_bps += scale_regions.binsize; - lastanchor = retanch; - } - } - un3bins.reverse(); - - let bodystart: u32; - let bodyend: u32; - if scale_regions.unscaled5prime > 0 { - bodystart = un5bins.last().unwrap().get_end(); - } else { - bodystart = *start.first().unwrap(); - } - if scale_regions.unscaled3prime > 0 { - bodyend = un3bins.first().unwrap().get_start(); - } else { - bodyend = *end.last().unwrap(); - } - let truebodylength = self.regionlength - scale_regions.unscaled5prime - scale_regions.unscaled3prime; - let neededbins = (scale_regions.regionbodylength / scale_regions.binsize) as usize; - let scaledbinsize = std::cmp::min(std::cmp::max(truebodylength / neededbins as u32, 1), scale_regions.binsize); - // Things are a bit tricky now, as we can do a linspace over the region, but we don't have a notion of the exons. - // I think easiest is to just pull a hashmap over the entire region, get linspace from hashmap to vec, and be done with it. - // technically we fetch a bunch of regions we don't need, but this operation is not too expensive. 
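// [Aside] A sketch of the linspace-over-binmap idea with assumed numbers: a
// truebodylength of 30 with scaledbinsize 10 fills binmap with keys 0..=3, while
// neededbins may still be 100; Array1::linspace(0.0, 3.0, 100) mapped through
// `as u32` then repeats each key many times, so short transcripts repeat bins
// instead of shrinking them below 1 bp.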
- - let mut binmap: HashMap = HashMap::new(); - let mut lastanchor: u32 = bodystart; - - for ix in 0..((truebodylength/scaledbinsize)+1) { - let (bin, anchor) = refpoint_exonwalker( - &exons, - lastanchor, - scaledbinsize, - chromend, - scale_regions.nan_after_end, - true - ); - lastanchor = anchor; - match bin { - Bin::Conbin(start, end) => { - if end > bodyend { - binmap.insert(ix, Bin::Conbin(start, bodyend)); - } else { - binmap.insert(ix, Bin::Conbin(start, end)); - } - }, - Bin::Catbin(bins) => { - if bins.last().unwrap().1 > bodyend { - let mut newbins: Vec<(u32, u32)> = Vec::new(); - for (s, e) in bins.iter() { - if *e > bodyend { - newbins.push((*s, bodyend)); - } else { - newbins.push((*s, *e)); - } - } - binmap.insert(ix, Bin::Catbin(newbins)); - } else { - binmap.insert(ix, Bin::Catbin(bins)); - } - }, - } - } - - let innerbins = Array1::linspace(0 as f32, ((truebodylength)/scaledbinsize) as f32, neededbins) - .mapv(|x| x as u32) - .map(|x| binmap.get(&x).unwrap().clone()) - .into_iter() - .collect::>(); - - // Combine the vectors and return - let mut combined_bins = Vec::new(); - if scale_regions.unscaled5prime > 0 { - combined_bins.extend(un5bins.into_iter()); - } - combined_bins.extend(innerbins.into_iter()); - if scale_regions.unscaled3prime > 0 { - combined_bins.extend(un3bins.into_iter()); - } - return combined_bins; - }, - _ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined."), - } - }, - "-" => { - match (&self.start, &self.end) { - (Revalue::U(start), Revalue::U(end)) => { - // No exons, negative strand. divide start - end as such: - // |---un3prime---|---bodylength---|---un5prime---| - let mut un5bins: Vec = Vec::new(); - let mut un3bins: Vec = Vec::new(); - let mut innerbins: Vec = Vec::new(); - if scale_regions.unscaled5prime > 0 { - un5bins.extend((0..scale_regions.unscaled5prime) - .step_by(scale_regions.binsize as usize) - .map(|i| Bin::Conbin(*end - i - scale_regions.binsize, *end - i)) - .collect::>()); - } - - if scale_regions.unscaled3prime > 0 { - un3bins.extend( (0..scale_regions.unscaled3prime) - .step_by(scale_regions.binsize as usize) - .rev() - .map(|i| Bin::Conbin(*start + scale_regions.unscaled3prime - i - scale_regions.binsize, *start + scale_regions.unscaled3prime - i)) - .collect::>() ); - } - let bodystart = *start + scale_regions.unscaled3prime; - let bodyend = *end - scale_regions.unscaled5prime; - - // Get the bins over the body length. These need to be scaled, so similar to deeptools < 4, linspace is used. - let neededbins = (scale_regions.regionbodylength / scale_regions.binsize) as usize; - // There's multiple options here: - // transcriptlength >= regionbodylength -> linspace - // regionbodylength / binsize > transcriptlength <= regionbodylength -> 1 >= binsize > binsize. - // transcriptlength <= regionbodylength / binsize -> index repetitions with binsize of one. 
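// [Aside] The branchy clamp just below is equivalent to the one-liner used on the
// + strand, assuming u32's Ord::clamp:
// let scaledbinsize = ((bodyend - bodystart) / neededbins as u32).clamp(1, scale_regions.binsize);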
- let mut scaledbinsize = (bodyend - bodystart)/neededbins as u32; - if scaledbinsize == 0 { - scaledbinsize = 1; - } - if scaledbinsize > scale_regions.binsize { - scaledbinsize = scale_regions.binsize; - } - - innerbins.extend( Array1::linspace(bodystart as f32, (bodyend - scaledbinsize) as f32, neededbins) - .mapv(|x| x as u32) - .map(|x| Bin::Conbin(*x, *x + scaledbinsize)) - .into_iter() - .collect::>() ); - - // Reverse innerbins to go from 3' -> 5' - innerbins.reverse(); - // Combine the vectors and return - let mut combined_bins = Vec::new(); - if scale_regions.unscaled3prime > 0 { - combined_bins.extend(un3bins.into_iter()); - } - combined_bins.extend(innerbins.into_iter()); - if scale_regions.unscaled5prime > 0 { - combined_bins.extend(un5bins.into_iter()); - } - return combined_bins; - }, - (Revalue::V(start), Revalue::V(end)) => { - let exons: Vec<(u32, u32)> = start.iter().zip(end.iter()) - .map(|(&s, &e)| (s, e)) - .collect(); - let mut un5bins: Vec = Vec::new(); - let mut un3bins: Vec = Vec::new(); - if scale_regions.unscaled5prime > 0 { - let mut walked_bps: u32 = 0; - let mut lastanchor: u32 = *end.last().unwrap(); - - while walked_bps < scale_regions.unscaled5prime { - let (bin, retanch) = refpoint_exonwalker( - &exons, - lastanchor, - scale_regions.binsize, - chromend, - scale_regions.nan_after_end, - false - ); - un5bins.push(bin); - walked_bps += scale_regions.binsize; - lastanchor = retanch; - } - } - un5bins.reverse(); - - if scale_regions.unscaled3prime > 0 { - let mut walked_bps: u32 = 0; - let mut lastanchor: u32 = start[0]; - - while walked_bps < scale_regions.unscaled3prime { - let (bin, retanch) = refpoint_exonwalker( - &exons, - lastanchor, - scale_regions.binsize, - chromend, - scale_regions.nan_after_end, - true - ); - un3bins.push(bin); - walked_bps += scale_regions.binsize; - lastanchor = retanch; - } - } - - let bodystart: u32; - let bodyend: u32; - if scale_regions.unscaled3prime > 0 { - bodystart = un3bins.last().unwrap().get_end(); - } else { - bodystart = *start.first().unwrap(); - } - if scale_regions.unscaled5prime > 0 { - bodyend = un5bins.first().unwrap().get_start(); - } else { - bodyend = *end.last().unwrap(); - } - let truebodylength = self.regionlength - scale_regions.unscaled5prime - scale_regions.unscaled3prime; - let neededbins = (scale_regions.regionbodylength / scale_regions.binsize) as usize; - let scaledbinsize = std::cmp::min(std::cmp::max(truebodylength / neededbins as u32, 1), scale_regions.binsize); - // Things are a bit tricky now, as we can do a linspace over the region, but we don't have a notion of the exons. - // I think easiest is to just pull a hashmap over the entire region, get linspace from hashmap to vec, and be done with it. - // technically we fetch a bunch of regions we don't need, but this operation is not too expensive. 
- - let mut binmap: HashMap = HashMap::new(); - let mut lastanchor: u32 = bodystart; - - for ix in 0..((truebodylength/scaledbinsize)+1) { - let (bin, anchor) = refpoint_exonwalker( - &exons, - lastanchor, - scaledbinsize, - chromend, - scale_regions.nan_after_end, - true - ); - lastanchor = anchor; - match bin { - Bin::Conbin(start, end) => { - if end > bodyend { - binmap.insert(ix, Bin::Conbin(start, bodyend)); - } else { - binmap.insert(ix, Bin::Conbin(start, end)); - } - }, - Bin::Catbin(bins) => { - if bins.last().unwrap().1 > bodyend { - let mut newbins: Vec<(u32, u32)> = Vec::new(); - for (s, e) in bins.iter() { - if *e > bodyend { - newbins.push((*s, bodyend)); - } else { - newbins.push((*s, *e)); - } - } - binmap.insert(ix, Bin::Catbin(newbins)); - } else { - binmap.insert(ix, Bin::Catbin(bins)); - } - }, - } - } - - let innerbins = Array1::linspace(0 as f32, ((truebodylength)/scaledbinsize) as f32, neededbins) - .mapv(|x| x as u32) - .map(|x| binmap.get(&x).unwrap().clone()) - .into_iter() - .collect::>(); - // Combine the vectors and return - let mut combined_bins = Vec::new(); - if scale_regions.unscaled5prime > 0 { - combined_bins.extend(un5bins.into_iter()); - } - combined_bins.extend(innerbins.into_iter()); - if scale_regions.unscaled3prime > 0 { - combined_bins.extend(un3bins.into_iter()); - } - return combined_bins; - }, - _ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined."), - } - }, - _ => panic!("Strand should either be + or - or . {:?} is not supported.", self.strand), - } - } -} - -fn refpoint_exonwalker(exons: &Vec<(u32, u32)>, anchor: u32, binsize: u32, chromend: u32, nan_after_end: bool, forward: bool) -> (Bin, u32) { - // Basic function that walks over exons, and returns a Bin (Either Conbin or Catbin) and the last anchorpoint. - let mut anchorix: Option = None; - - for (ix, exon) in exons.iter().enumerate() { - if anchor >= exon.0 && anchor <= exon.1 { - anchorix = Some(ix); - } - } - if forward { - // Walk forward (downstream, towards chromosome end) - match anchorix { - Some(i) => { - // anchor sits in an exon. Check if anchor + binsize is also in same exon. - if anchor + binsize <= exons[i].1 { - (Bin::Conbin(anchor, anchor + binsize), anchor + binsize) - } else { - // anchor + binsize is not in same exon. We need a Catbin. - // Things are a bit more difficult here as well, as we need to walk exons. - let mut start_end_vec: Vec<(u32, u32)> = Vec::new(); - start_end_vec.push( (anchor, exons[i].1) ); - - let mut remainingbin: u32 = binsize - (exons[i].1 - anchor); - let mut lastix: usize = i; - let mut lastanchor: u32 = exons[i].1; - - while remainingbin != 0 { - if lastix + 1 < exons.len() { - // next exon is available. - // Two options here: - // the remainder fits in the lastix + 1 exon. We are done. - // the remainder doesn't fit in the lastix + 1 exon. We need to walk further. - if exons[lastix+1].1 - exons[lastix+1].0 >= remainingbin { - // remainder fits in next exon. - start_end_vec.push( (exons[lastix+1].0, exons[lastix+1].0 + remainingbin) ); - lastanchor = exons[lastix+1].0 + remainingbin; - remainingbin = 0; - } else { - // Remainder is larger then our exon. We need another walk. - start_end_vec.push( (exons[lastix+1].0, exons[lastix+1].1) ); - remainingbin -= exons[lastix+1].1 - exons[lastix+1].0; - lastix += 1; - lastanchor = exons[lastix].1; - } - } else { - // No next exon available. Remainder can just be genomic. - // The last entry here can be changed to include the last part. 
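// [Aside] A worked example of the forward walk with assumed exons (100,200) and
// (300,400), anchor 150 and binsize 120: the first piece (150,200) leaves 70 bp,
// which the next exon absorbs, so the walker returns
// (Bin::Catbin(vec![(150,200), (300,370)]), 370).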
- if nan_after_end { - start_end_vec.push((0,0)); - } else { - let last = start_end_vec.last_mut().unwrap(); - assert_eq!( - last.1, - lastanchor, - "In the exon - genomic walk, our coordinates are not contiguous" - ); - // Check we don't fall of the chromosome. - if lastanchor + remainingbin > chromend { - last.1 = chromend; - lastanchor = chromend; - } else { - last.1 = lastanchor + remainingbin; - lastanchor = lastanchor + remainingbin; - } - } - remainingbin = 0; - } - } - // We now have a Vec of start - end, we can construct a CatBin. - // Note that CatBins are (absstart, absstop, ((intstart1, intstart2), ...)) - // This seems weird, but makes sure we need to slice the bigwig file only once per bin. - if start_end_vec.len() == 1 { - (Bin::Conbin(anchor, lastanchor), lastanchor) - } else { - (Bin::Catbin(start_end_vec), lastanchor) - } - } - }, - None => { - // our anchor doesn't sit in exons. We just return the anchor + binsize as Bin - if anchor + binsize > chromend { - (Bin::Conbin(anchor, chromend), chromend) - } else { - (Bin::Conbin(anchor, anchor + binsize), anchor + binsize) - } - } - } - } else { - // Walk backwards (upstream, towards chromosome start) - match anchorix { - Some(i) => { - // anchor sits in an exon. Check if anchor - binsize is also in same exon. - if anchor - binsize >= exons[i].0 { - (Bin::Conbin(anchor - binsize, anchor), anchor - binsize) - } else { - // anchor + binsize is not in same exon. We need a Catbin. - // Things are a bit more difficult here as well, as we need to walk exons. - let mut start_end_vec: Vec<(u32, u32)> = Vec::new(); - start_end_vec.push( (exons[i].0, anchor) ); - - let mut remainingbin: u32 = binsize - (anchor - exons[i].0); - let mut lastix: usize = i; - let mut lastanchor: u32 = exons[i].0; - - while remainingbin != 0 { - if lastix >= 1 { - // previous exon is available. - // Two options here: - // the remainder fits in the previous exon. We are done. - // the remainder doesn't fit in the previous exon. We need to walk further. - if exons[lastix-1].1 - exons[lastix-1].0 >= remainingbin { - // remainder fits in next exon. - start_end_vec.push( (exons[lastix-1].1 - remainingbin, exons[lastix-1].1) ); - lastanchor = exons[lastix-1].1 - remainingbin; - remainingbin = 0; - } else { - // Remainder is larger then our exon. We need another walk. - start_end_vec.push( (exons[lastix-1].0, exons[lastix-1].1 ) ); - remainingbin -= exons[lastix-1].1 - exons[lastix-1].0; - lastix -= 1; - lastanchor = exons[lastix].0; - } - } else { - // No previous exon available. Remainder can just be genomic. - // The last entry here can be changed to include the last part. - if nan_after_end { - start_end_vec.push( (0,0) ); - } else { - let last = start_end_vec.last_mut().unwrap(); - assert_eq!( - last.0, - lastanchor, - "In the exon - genomic walk (reverse), our coordinates are not contiguous" - ); - // Check we don't go in the negative. - if lastanchor < remainingbin { - last.0 = 0; - lastanchor = 0; - } else { - last.0 = lastanchor - remainingbin; - lastanchor = lastanchor - remainingbin; - } - } - remainingbin = 0; - } - } - // We now have a Vec of start - end, we can construct a CatBin. - // Note that CatBins are (absstart, absstop, ((intstart1, intstart2), ...)) - // This seems weird, but makes sure we need to slice the bigwig file only once per bin. 
- if start_end_vec.len() == 1 { - (Bin::Conbin(lastanchor, anchor), lastanchor) - } else { - start_end_vec.reverse(); - (Bin::Catbin(start_end_vec), lastanchor) - } - } - }, - None => { - // our anchor doesn't sit in exons. We just return the anchor - binsize as Bin - if anchor < binsize { - (Bin::Conbin(0, anchor), 0) - } else { - (Bin::Conbin(anchor - binsize, anchor), anchor - binsize) - } - } - } - } -} - fn matrix_dump( sortregions: &str, sortusing: &str, diff --git a/src/covcalc.rs b/src/covcalc.rs index 0013a7da5e..01a054c456 100644 --- a/src/covcalc.rs +++ b/src/covcalc.rs @@ -3,8 +3,10 @@ use std::collections::HashMap; use tempfile::{Builder, TempPath}; use std::io::{BufWriter, Write}; use std::cmp::min; +use std::fmt; +use ndarray::Array1; -pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: &str) -> (Vec<(String, u32, u32)>, HashMap<String, u32>) { +pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: &str) -> (Vec<Region>, HashMap<String, u32>) { // Takes a vector of regions, and a bam reference // returns a vector of regions, with all chromosomes and full lengths if original regions was empty // Else it validates the regions against the information from the bam header let bam = IndexedReader::from_path(bam_ifile).unwrap(); let header = bam.header().clone(); - let mut chromregions: Vec<(String, u32, u32)> = Vec::new(); + let mut chromregions: Vec<Region> = Vec::new(); let mut chromsizes = HashMap::new(); if regions.is_empty() { // if regions is empty, we default to all chromosomes, full length @@ -21,7 +23,16 @@ .expect("Invalid UTF-8 in chromosome name"); let chromlen = header.target_len(tid) .expect("Error retrieving length for chromosome"); - chromregions.push((chromname.clone(), 0, chromlen as u32)); + let _reg = Region { + chrom: chromname.clone(), + start: Revalue::U(0), + end: Revalue::U(chromlen as u32), + score: String::from("."), + strand: String::from("."), + name: format!("{}:{}-{}", chromname, 0, chromlen), + regionlength: chromlen as u32, + }; + chromregions.push(_reg); chromsizes.insert(chromname.to_string(), chromlen as u32); } } else { @@ -43,11 +54,21 @@ let chromname = &region.0; assert!(region.1 < region.2, "Region start must be strictly less than region end."); assert!(validchroms.contains(chromname), "Chromosome {} not found in bam header", chromname); - chromregions.push((chromname.clone(), region.1, region.2)); + let _reg = Region { + chrom: chromname.clone(), + start: Revalue::U(region.1), + end: Revalue::U(region.2), + score: String::from("."), + strand: String::from("."), + name: format!("{}:{}-{}", chromname, region.1, region.2), + regionlength: region.2 - region.1, + }; + chromregions.push(_reg); } } // Sort regions to make our lives easier down the line - chromregions.sort_by(|a, b| a.0.cmp(&b.0).then(a.1.cmp(&b.1))); + // Sort Vec of Regions per chromosome, and then by start. 
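// [Aside] The sort stays commented out below, presumably because Revalue derives no
// Ord; a sketch that orders on the plain u32 anchors instead, using the
// Region::get_startu() accessor this patch already uses elsewhere:
// chromregions.sort_by(|a, b| a.chrom.cmp(&b.chrom).then(a.get_startu().cmp(&b.get_startu())));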
+ // chromregions.sort_by(|a, b| a.chrom.cmp(&b.chrom).then(a.start.cmp(&b.start))); return (chromregions, chromsizes); } @@ -56,7 +77,7 @@ pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: &str) -> (Vec #[allow(unused_assignments)] pub fn bam_pileup<'a>( bam_ifile: &str, - region: &'a (String, u32, u32), + regionvec: &'a Vec, binsize: &u32, ispe: &bool, ignorechr: &Vec, @@ -75,8 +96,6 @@ pub fn bam_pileup<'a>( // open bam file and fetch proper chrom let mut bam = IndexedReader::from_path(bam_ifile).unwrap(); - bam.fetch((region.0.as_str(), region.1, region.2)) - .expect(&format!("Error fetching region: {:?}", region)); // init variables for mapping statistics and lengths let mut mapped_reads: u32 = 0; @@ -95,216 +114,224 @@ pub fn bam_pileup<'a>( // Two cases: either the binsize is 1, or it is > 1. // Counting between the two modes is different. In binsize == 1 we compute pileups // for binsize > 1, we count the number of reads that overlap a bin. - if binsize > &1 { - // populate the bg vector with 0 counts over all bins - let mut counts: Vec = vec![0.0; (region.2 - region.1).div_ceil(*binsize) as usize]; - // let mut binstart = region.1; - let mut binix: u32 = 0; - - for record in bam.records() { - let record = record.expect("Error parsing record."); - if !ignorechr.contains(®ion.0) { - if record.is_unmapped() { - unmapped_reads += 1; - } else { - mapped_reads += 1; - if *ispe { - if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) { - fraglens.push(record.insert_size().abs() as u32); + + // Convert region to tuple to ease computations. + for regstruct in regionvec.iter() { + let region = (regstruct.chrom.clone(), regstruct.get_startu(), regstruct.get_endu()); + bam.fetch((region.0.as_str(), region.1, region.2)) + .expect(&format!("Error fetching region: {:?}", region)); + + if binsize > &1 { + // populate the bg vector with 0 counts over all bins + let mut counts: Vec = vec![0.0; (region.2 - region.1).div_ceil(*binsize) as usize]; + // let mut binstart = region.1; + let mut binix: u32 = 0; + + for record in bam.records() { + let record = record.expect("Error parsing record."); + if !ignorechr.contains(®ion.0) { + if record.is_unmapped() { + unmapped_reads += 1; + } else { + mapped_reads += 1; + if *ispe { + if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) { + fraglens.push(record.insert_size().abs() as u32); + } } + readlens.push(record.seq_len() as u32); } - readlens.push(record.seq_len() as u32); } - } - let blocks = bam_blocks(record); - // keep a record of bin indices that have been updated already - let mut changedbins: Vec = Vec::new(); - for block in blocks.into_iter() { - // Don't count blocks that exceed the chromosome - if block.0 >= region.2 { - continue; - } - binix = block.0 / binsize; - if !changedbins.contains(&binix) { - counts[binix as usize] += 1.0; - changedbins.push(binix); - } - // Don't count blocks that exceed the chromosome - if block.1 >= region.2 { - continue; - } - // Note that our blocks are strictly less then region ends, hence no problem with bin spewing at end: - // binix = [a,b) where b == region.2 would divide into binix+1 (doesn't exist). 
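// [Aside] For example, with binsize 50 a block (120, 170) increments bin 120/50 = 2
// and bin 170/50 = 3 once each; the min..=max fill further below then catches any
// bins a long read jumps over entirely.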
- binix = block.1 / binsize; - if !changedbins.contains(&binix) { - counts[binix as usize] += 1.0; - changedbins.push(binix); - } - } - // if changedbins contains non-continuous blocks (perhaps read length spans multiple bins), update the inbetweens - if let (Some(&min), Some(&max)) = (changedbins.iter().min(), changedbins.iter().max()) { - for ix in min..=max { - if !changedbins.contains(&ix) { - counts[ix as usize] += 1.0; - changedbins.push(ix); + let blocks = bam_blocks(record); + // keep a record of bin indices that have been updated already + let mut changedbins: Vec = Vec::new(); + for block in blocks.into_iter() { + // Don't count blocks that exceed the chromosome + if block.0 >= region.2 { + continue; + } + binix = block.0 / binsize; + if !changedbins.contains(&binix) { + counts[binix as usize] += 1.0; + changedbins.push(binix); + } + // Don't count blocks that exceed the chromosome + if block.1 >= region.2 { + continue; + } + // Note that our blocks are strictly less then region ends, hence no problem with bin spewing at end: + // binix = [a,b) where b == region.2 would divide into binix+1 (doesn't exist). + binix = block.1 / binsize; + if !changedbins.contains(&binix) { + counts[binix as usize] += 1.0; + changedbins.push(binix); } } - } - } - let mut writer = BufWriter::new(&bg); - // There are two scenarios: - // bamCoverage mode -> we can collapse bins with same coverage (collapse = true) - // bamCompare & others -> We cannot collapse the bins, yet. (collapse = false) - if counts.len() == 1 { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, region.1, region.2, counts[0]).unwrap(); - } else { - if collapse { - let mut lcov = counts[0]; - let mut lstart = region.1; - let mut lend = region.1 + binsize; - let mut start = lstart; - let mut end = lend; - let mut bin: u32 = 0; - - for (ix, count) in counts.into_iter().skip(1).enumerate() { - bin = (ix + 1) as u32; // offset of 1 due to skip(1) - start = (bin * binsize) + region.1; - end = min(start + binsize, region.2); - if count != lcov { - //bg.push((®ion.0, lstart, lend, lcov)); - writeln!(writer, "{}\t{}\t{}\t{}", region.0, lstart, lend, lcov).unwrap(); - lstart = lend; - lcov = count; + // if changedbins contains non-continuous blocks (perhaps read length spans multiple bins), update the inbetweens + if let (Some(&min), Some(&max)) = (changedbins.iter().min(), changedbins.iter().max()) { + for ix in min..=max { + if !changedbins.contains(&ix) { + counts[ix as usize] += 1.0; + changedbins.push(ix); + } } - lend = end; } - // write last entry - writeln!(writer, "{}\t{}\t{}\t{}", region.0, lstart, lend, lcov).unwrap(); + } + let mut writer = BufWriter::new(&bg); + // There are two scenarios: + // bamCoverage mode -> we can collapse bins with same coverage (collapse = true) + // bamCompare & others -> We cannot collapse the bins, yet. 
(collapse = false) + if counts.len() == 1 { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, region.1, region.2, counts[0]).unwrap(); } else { - let mut start = region.1; - let mut end = region.1 + binsize; - writeln!(writer, "{}\t{}\t{}\t{}", region.0, start, end, counts[0]).unwrap(); - for (ix, count) in counts.into_iter().skip(1).enumerate() { - let bin = (ix + 1) as u32; - start = (bin * binsize) + region.1; - end = min(start + binsize, region.2); - writeln!(writer, "{}\t{}\t{}\t{}", region.0, start, end, count).unwrap(); + if collapse { + let mut lcov = counts[0]; + let mut lstart = region.1; + let mut lend = region.1 + binsize; + let mut start = lstart; + let mut end = lend; + let mut bin: u32 = 0; + + for (ix, count) in counts.into_iter().skip(1).enumerate() { + bin = (ix + 1) as u32; // offset of 1 due to skip(1) + start = (bin * binsize) + region.1; + end = min(start + binsize, region.2); + if count != lcov { + //bg.push((®ion.0, lstart, lend, lcov)); + writeln!(writer, "{}\t{}\t{}\t{}", region.0, lstart, lend, lcov).unwrap(); + lstart = lend; + lcov = count; + } + lend = end; + } + // write last entry + writeln!(writer, "{}\t{}\t{}\t{}", region.0, lstart, lend, lcov).unwrap(); + } else { + let mut start = region.1; + let mut end = region.1 + binsize; + writeln!(writer, "{}\t{}\t{}\t{}", region.0, start, end, counts[0]).unwrap(); + for (ix, count) in counts.into_iter().skip(1).enumerate() { + let bin = (ix + 1) as u32; + start = (bin * binsize) + region.1; + end = min(start + binsize, region.2); + writeln!(writer, "{}\t{}\t{}\t{}", region.0, start, end, count).unwrap(); + } } } - } - // binsize = 1 - } else { - let mut l_start: u32 = region.1; - let mut l_end: u32 = region.1; - let mut l_cov: u32 = 0; - let mut pileup_start: bool = true; - let mut writer = BufWriter::new(&bg); - let mut written: bool = false; - for record in bam.records() { - let record = record.expect("Error parsing record."); - if !ignorechr.contains(®ion.0) { - if record.is_unmapped() { - unmapped_reads += 1; - } else { - mapped_reads += 1; - if *ispe { - if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) { - fraglens.push(record.insert_size().abs() as u32); + // binsize = 1 + } else { + let mut l_start: u32 = region.1; + let mut l_end: u32 = region.1; + let mut l_cov: u32 = 0; + let mut pileup_start: bool = true; + let mut writer = BufWriter::new(&bg); + let mut written: bool = false; + for record in bam.records() { + let record = record.expect("Error parsing record."); + if !ignorechr.contains(®ion.0) { + if record.is_unmapped() { + unmapped_reads += 1; + } else { + mapped_reads += 1; + if *ispe { + if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) { + fraglens.push(record.insert_size().abs() as u32); + } } + readlens.push(record.seq_len() as u32); } - readlens.push(record.seq_len() as u32); - } - } - } - // Refetch bam file to reset iterator. - bam.fetch((region.0.as_str(), region.1, region.2)) - .expect(&format!("Error fetching region: {:?}", region)); - for p in bam.pileup() { - // Per default pileups count deletions in cigar string too. - // For consistency with previous deepTools functionality, we ignore them. - // to be fair I think they shouldn't be counted anyhow, but who am I ? - // Note that coverages can be 0 now. 
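// [Aside] For example, a read with CIGAR 10M5D10M still produces pileup columns
// across the 5 bp deletion, but those alignments answer is_del() == true; skipping
// them keeps deleted bases out of the coverage, consistent with the pre-4.0
// deepTools behaviour the comments above refer to.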
- let pileup = p.expect("Error parsing pileup."); - let mut cov: u32 = 0; - for _a in pileup.alignments() { - if !_a.is_del() { - cov += 1; } } - let pos = pileup.pos(); - if pileup_start { - // if the first pileup is not at the start of the region, write 0 coverage - if pos > l_start { - if collapse { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_start, pos, 0 as f32).unwrap(); - } else { - for p in l_start..pos { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, 0 as f32).unwrap(); - } + // Refetch bam file to reset iterator. + bam.fetch((region.0.as_str(), region.1, region.2)) + .expect(&format!("Error fetching region: {:?}", region)); + for p in bam.pileup() { + // Per default pileups count deletions in cigar string too. + // For consistency with previous deepTools functionality, we ignore them. + // to be fair I think they shouldn't be counted anyhow, but who am I ? + // Note that coverages can be 0 now. + let pileup = p.expect("Error parsing pileup."); + let mut cov: u32 = 0; + for _a in pileup.alignments() { + if !_a.is_del() { + cov += 1; } - written = true; } - pileup_start = false; - l_start = pos; - l_cov = cov; - } else { - if pos != l_end + 1 { - if collapse { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_start, l_end + 1, l_cov as f32).unwrap(); - writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_end + 1, pos, 0 as f32).unwrap(); - } else { - for p in l_start..l_end + 1 { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, l_cov as f32).unwrap(); - } - for p in l_end + 1..pos { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, 0 as f32).unwrap(); + let pos = pileup.pos(); + if pileup_start { + // if the first pileup is not at the start of the region, write 0 coverage + if pos > l_start { + if collapse { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_start, pos, 0 as f32).unwrap(); + } else { + for p in l_start..pos { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, 0 as f32).unwrap(); + } } + written = true; } - written = true; + pileup_start = false; l_start = pos; l_cov = cov; - } else if l_cov != cov { - if collapse { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_start, pos, l_cov as f32).unwrap(); - } else { - for p in l_start..pos { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, l_cov as f32).unwrap(); + } else { + if pos != l_end + 1 { + if collapse { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_start, l_end + 1, l_cov as f32).unwrap(); + writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_end + 1, pos, 0 as f32).unwrap(); + } else { + for p in l_start..l_end + 1 { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, l_cov as f32).unwrap(); + } + for p in l_end + 1..pos { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, 0 as f32).unwrap(); + } + } + written = true; + l_start = pos; + l_cov = cov; + } else if l_cov != cov { + if collapse { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_start, pos, l_cov as f32).unwrap(); + } else { + for p in l_start..pos { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, l_cov as f32).unwrap(); + } } + written = true; + l_start = pos; + l_cov = cov; } - written = true; - l_start = pos; - l_cov = cov; - } - } - l_end = pos; - l_cov = cov; - } - // if bg is empty, whole region is 0 coverage - if !written { - if collapse { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_start, region.2, 0 as f32).unwrap(); - } else { - for p in l_start..region.2 { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, 0 as f32).unwrap(); } + l_end = pos; + l_cov = cov; } - } else { - // 
Still need to write the last pileup(s) - if collapse { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_start, l_end + 1, l_cov as f32).unwrap(); - // Make sure that if we didn't reach end of chromosome, we still write 0 cov. - if l_end + 1 < region.2 { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_end + 1, region.2, 0 as f32).unwrap(); + // if bg is empty, whole region is 0 coverage + if !written { + if collapse { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_start, region.2, 0 as f32).unwrap(); + } else { + for p in l_start..region.2 { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, 0 as f32).unwrap(); + } } } else { - for p in l_start..l_end + 1 { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, l_cov as f32).unwrap(); - } - if l_end + 1 < region.2 { - for p in l_end + 1..region.2 + 1 { - writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, 0 as f32).unwrap(); + // Still need to write the last pileup(s) + if collapse { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_start, l_end + 1, l_cov as f32).unwrap(); + // Make sure that if we didn't reach end of chromosome, we still write 0 cov. + if l_end + 1 < region.2 { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, l_end + 1, region.2, 0 as f32).unwrap(); + } + } else { + for p in l_start..l_end + 1 { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, l_cov as f32).unwrap(); + } + if l_end + 1 < region.2 { + for p in l_end + 1..region.2 + 1 { + writeln!(writer, "{}\t{}\t{}\t{}", region.0, p, p + 1, 0 as f32).unwrap(); + } } } } @@ -347,4 +374,1046 @@ pub struct Alignmentfilters { pub samflagexclude: u16, pub minfraglen: u32, pub maxfraglen: u32, -} \ No newline at end of file +} + + +#[derive(Clone, Debug)] +pub struct Region { + pub chrom: String, + pub start: Revalue, + pub end: Revalue, + pub score: String, + pub strand: String, + pub name: String, + pub regionlength: u32 +} + +impl Region { + pub fn assert_end(&self, chromend: u32) { + match &self.end { + Revalue::U(end) => { + assert!( + *end <= chromend, + "Region end goes beyond chromosome boundary. Fix {}. {} {} {} (chr end = {})", self.name, self.chrom, self.start, self.end, chromend + ); + }, + Revalue::V(ends) => { + for end in ends.iter() { + assert!( + *end <= chromend, + "Region end goes beyond chromosome boundary. Fix {}. {} {} {} (chr end = {})", self.name, self.chrom, self.start, end, chromend + ); + } + } + } + } + + pub fn get_anchorpoint(&self, referencepoint: &str) -> u32 { + // reference-point mode. + // What is exactly returned depends on a couple of parameters + // what is the referencepoint : TSS, center, TES + // what is the strand: +, -, . (note . is assumed to be +) + // depending on if we have exon blocks (start / end are Revalue V == Vectors) or not (start / end are Revalue U == u32's) + match referencepoint { + "TSS" => { + match self.strand.as_str() { + "+" | "." => match &self.start {Revalue::U(start) => *start, Revalue::V(starts) => starts[0]}, + "-" => match &self.end {Revalue::U(end) => *end, Revalue::V(ends) => *ends.last().unwrap()}, + _ => panic!("Strand should either be + or - or . {:?} is not supported.", self.strand), + } + }, + "TES" => { + match self.strand.as_str() { + "+" | "." => match &self.end {Revalue::U(end) => *end, Revalue::V(ends) => *ends.last().unwrap()}, + "-" => match &self.start {Revalue::U(start) => *start, Revalue::V(starts) => starts[0]}, + _ => panic!("Strand should either be + or - or . {:?} is not supported.", self.strand), + } + }, + "center" => { + // Here + or - doesn't matter. 
It is important though if we have 'metagenes' or not. + match (&self.start, &self.end) { + (Revalue::U(start), Revalue::U(end)) => { + (*start + *end) / 2 + }, + (Revalue::V(starts), Revalue::V(ends)) => { + let exonlength: u32 = starts.iter().zip(ends.iter()).map(|(s, e)| e - s).sum(); + let middle = exonlength / 2; + let mut cumsum: u32 = 0; + for (s, e) in starts.iter().zip(ends.iter()) { + cumsum += e - s; + if cumsum >= middle { + return s + (middle - (cumsum - (e - s))); + } + } + panic!( + "Middle of region not found. Fix {}. {}:{}-{}", + self.name, self.chrom, self.start, self.end + ) + }, + _ => panic!( + "Start and End are not either both u32, or Vecs. This means your regions file is ill-defined. Fix {}. {}:{}-{}", + self.name, self.chrom, self.start, self.end + ), + } + }, + _ => panic!( + "Reference should either be TSS, TES or center. {:?} is not supported.", + referencepoint + ), + } + } + + pub fn get_anchor_bins(&self, scale_regions: &Scalingregions, chromend: u32) -> Vec { + // Given an anchorpoint, return a vector, start, end , middle + // The order of the vector is always 5' -> 3', meaning 'increasing' for +/. regions, and 'decreasing' for - regions. + // At this stage, two situations are possible: + // - self.start / self.end are Revalue::U, meaning we are in 'non metagene' mode. + // - self.start / self.end are Revalue::V, meaning we are in 'metagene' mode, and the bins returned are exon-aware. + // We need a notion of bins that don't make sense (i.e. beyond chromosome boundaries). These are encoded as (0,0) + + let mut bins: Vec = Vec::new(); + let mut bodybins: Vec = Vec::new(); + + // Define anchorpoints + let anchorstart; + let anchorstop; + match scale_regions.mode.as_str() { + "reference-point" => { + anchorstart = self.get_anchorpoint(&scale_regions.referencepoint); + anchorstop = anchorstart; + }, + "scale-regions" => { + match (&self.start, &self.end) { + (Revalue::U(start), Revalue::U(end)) => { + anchorstart = *start; + anchorstop = *end; + }, + (Revalue::V(start), Revalue::V(end)) => { + anchorstart = *start.first().unwrap(); + anchorstop = *end.last().unwrap(); + }, + _ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined. Fix {}.",self.name), + } + }, + _ => panic!("Mode should either be reference-point or scale-regions. {} is not supported.", scale_regions.mode), + } + if scale_regions.mode != "reference-point" { + // scale-regions mode. Assert + assert!(scale_regions.regionbodylength != 0, "scale-regions mode, but regionbodylength is 0."); + if self.regionlength < (scale_regions.unscaled5prime + scale_regions.unscaled3prime) || + self.regionlength - (scale_regions.unscaled5prime + scale_regions.unscaled3prime) < scale_regions.binsize { + println!("Warning ! Region {} is shorter than the binsize (potentially after unscaled regions taken into account. Whole region encoded as 0 or NA", self.name); + let nbin = scale_regions.cols_expected / scale_regions.bwfiles; + for _ in 0..nbin { + bins.push(Bin::Conbin(0,0)); + } + return bins; + } else { + bodybins.extend(self.scale_regionbody(scale_regions, chromend)); + } + } + + // Get flanking regions. + // Note that we still need to deal with exon - non-exon as reference-point mode could require metagene walks. + match self.strand.as_str() { + "+" | "." 
=> { + match (&self.start, &self.end) { + (Revalue::U(start), Revalue::U(end)) => { + let mut leftbins: Vec = Vec::new(); + let mut rightbins: Vec = Vec::new(); + + let mut absstart: i64 = anchorstart as i64 - scale_regions.upstream as i64; + let absstop: i64 = anchorstop as i64 + scale_regions.downstream as i64; + + for binix in (absstart..anchorstart as i64).step_by(scale_regions.binsize as usize) { + if binix < 0 || binix as u32 >= chromend || (binix + scale_regions.binsize as i64) as u32 >= chromend { + leftbins.push(Bin::Conbin(0,0)); + } else if scale_regions.nan_after_end && binix as u32 <= *start { + leftbins.push(Bin::Conbin(0,0)); + } else { + leftbins.push(Bin::Conbin(binix as u32, (binix as u32) + scale_regions.binsize)); + } + } + + for binix in (anchorstop as i64..absstop).step_by(scale_regions.binsize as usize) { + if binix < 0 || binix as u32 >= chromend || (binix + scale_regions.binsize as i64) as u32 >= chromend { + rightbins.push(Bin::Conbin(0,0)); + } else if scale_regions.nan_after_end && binix as u32 >= *end { + rightbins.push(Bin::Conbin(0,0)); + } else { + rightbins.push(Bin::Conbin(binix as u32, (binix as u32) + scale_regions.binsize)); + } + } + + for bin in leftbins.into_iter() { + bins.push(bin); + } + // If we have bodybins, they should be squeezed in here. + for bin in bodybins.into_iter() { + bins.push(bin); + } + // Reset bodybins, as they are consumed. + bodybins = Vec::new(); + for bin in rightbins.into_iter() { + bins.push(bin); + } + } + (Revalue::V(start), Revalue::V(end)) => { + let exons: Vec<(u32, u32)> = start.iter().zip(end.iter()) + .map(|(&s, &e)| (s, e)) + .collect(); + // Right side. + let mut rightbins: Vec = Vec::new(); + let mut lastanchor: u32 = anchorstop; + let mut walked_bps: u32 = 0; + while walked_bps < scale_regions.downstream { + if lastanchor >= chromend { + rightbins.push(Bin::Conbin(0,0)); + walked_bps += scale_regions.binsize; + } else { + let (bin, retanch) = refpoint_exonwalker( + &exons, + lastanchor, + scale_regions.binsize, + chromend, + scale_regions.nan_after_end, + true + ); + rightbins.push(bin); + walked_bps += scale_regions.binsize; + lastanchor = retanch; + } + } + // Left side. + let mut leftbins: Vec = Vec::new(); + let mut lastanchor: u32 = anchorstart; + let mut walked_bps: u32 = 0; + while walked_bps < scale_regions.upstream { + if lastanchor == 0 { + leftbins.push(Bin::Conbin(0,0)); + walked_bps += scale_regions.binsize; + } else { + let (bin, retanch) = refpoint_exonwalker( + &exons, + lastanchor, + scale_regions.binsize, + chromend, + scale_regions.nan_after_end, + false + ); + leftbins.push(bin); + walked_bps += scale_regions.binsize; + lastanchor = retanch; + } + } + // Now we need to reverse the leftbins, as we walked backwards. + leftbins.reverse(); + for bin in leftbins.into_iter() { + bins.push(bin); + } + // If we have bodybins, they should be squeezed in here. + for bin in bodybins.into_iter() { + bins.push(bin); + } + // Reset bodybins, as they are consumed. + bodybins = Vec::new(); + for bin in rightbins.into_iter() { + bins.push(bin); + } + } + _ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined. Fix {}. 
{}:{}-{}",
+                        self.name, self.chrom, self.start, self.end),
+                }
            },
+            "-" => {
+                match (&self.start, &self.end) {
+                    (Revalue::U(start), Revalue::U(end)) => {
+                        let mut leftbins: Vec<Bin> = Vec::new();
+                        let mut rightbins: Vec<Bin> = Vec::new();
+
+                        let mut absstart: i64 = anchorstop as i64 + scale_regions.upstream as i64;
+                        let absstop: i64 = anchorstart as i64 - scale_regions.downstream as i64;
+
+                        let steps: Vec<_> = (anchorstop as i64..absstart)
+                            .step_by(scale_regions.binsize as usize)
+                            .collect();
+                        // Upstream flank (5' side for the - strand): fills rightbins, which are emitted first.
+                        for binix in steps.into_iter().rev() {
+                            if binix as u32 > chromend || (binix + scale_regions.binsize as i64) as u32 > chromend {
+                                rightbins.push(Bin::Conbin(0,0));
+                            } else if scale_regions.nan_after_end && binix as u32 >= *end {
+                                rightbins.push(Bin::Conbin(0,0));
+                            } else {
+                                rightbins.push(Bin::Conbin(binix as u32, (binix as u32) + scale_regions.binsize));
+                            }
+                        }
+
+                        let steps: Vec<_> = (absstop..anchorstart as i64)
+                            .step_by(scale_regions.binsize as usize)
+                            .collect();
+                        // Downstream flank (3' side for the - strand): fills leftbins, which are emitted last.
+                        for binix in steps.into_iter().rev() {
+                            if binix < 0 {
+                                leftbins.push(Bin::Conbin(0,0));
+                            } else if scale_regions.nan_after_end && binix as u32 + scale_regions.binsize <= *start {
+                                leftbins.push(Bin::Conbin(0,0));
+                            } else {
+                                leftbins.push(Bin::Conbin(binix as u32, (binix as u32) + scale_regions.binsize));
+                            }
+                        }
+
+                        for bin in rightbins.into_iter() {
+                            bins.push(bin);
+                        }
+                        // If we have bodybins, they should be squeezed in here.
+                        bodybins.reverse();
+                        for bin in bodybins.into_iter() {
+                            bins.push(bin);
+                        }
+                        // Reset bodybins, as they are consumed.
+                        bodybins = Vec::new();
+                        for bin in leftbins.into_iter() {
+                            bins.push(bin);
+                        }
+                    }
+                    (Revalue::V(start), Revalue::V(end)) => {
+                        let exons: Vec<(u32, u32)> = start.iter().zip(end.iter())
+                            .map(|(&s, &e)| (s, e))
+                            .collect();
+                        // Right side.
+                        let mut rightbins: Vec<Bin> = Vec::new();
+                        let mut lastanchor: u32 = anchorstop;
+                        let mut walked_bps: u32 = 0;
+                        while walked_bps < scale_regions.upstream {
+                            if lastanchor >= chromend {
+                                rightbins.push(Bin::Conbin(0,0));
+                                walked_bps += scale_regions.binsize;
+                            } else {
+                                let (bin, retanch) = refpoint_exonwalker(
+                                    &exons,
+                                    lastanchor,
+                                    scale_regions.binsize,
+                                    chromend,
+                                    scale_regions.nan_after_end,
+                                    true
+                                );
+                                rightbins.push(bin);
+                                walked_bps += scale_regions.binsize;
+                                lastanchor = retanch;
+                            }
+                        }
+                        // Left side.
+                        let mut leftbins: Vec<Bin> = Vec::new();
+                        let mut lastanchor: u32 = anchorstart;
+                        let mut walked_bps: u32 = 0;
+                        while walked_bps < scale_regions.downstream {
+                            if lastanchor == 0 {
+                                leftbins.push(Bin::Conbin(0,0));
+                                walked_bps += scale_regions.binsize;
+                            } else {
+                                let (bin, retanch) = refpoint_exonwalker(
+                                    &exons,
+                                    lastanchor,
+                                    scale_regions.binsize,
+                                    chromend,
+                                    scale_regions.nan_after_end,
+                                    false
+                                );
+                                leftbins.push(bin);
+                                walked_bps += scale_regions.binsize;
+                                lastanchor = retanch;
+                            }
+                        }
+                        // Note that now we need to go the exact opposite way as for the + strand, as the 'highest position' is the 'starting point'.
+                        rightbins.reverse();
+                        for bin in rightbins.into_iter() {
+                            bins.push(bin);
+                        }
+                        bodybins.reverse();
+                        for bin in bodybins.into_iter() {
+                            bins.push(bin);
+                        }
+                        bodybins = Vec::new();
+                        for bin in leftbins.into_iter() {
+                            bins.push(bin);
+                        }
+                    }
+                    _ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined. Fix {}. {}:{}-{}",
+                        self.name, self.chrom, self.start, self.end),
+                }
+            },
+            _ => panic!("Strand should either be + or - or . 
{:?} is not supported.", self.strand),
+        };
+        assert_eq!(
+            bins.len(),
+            scale_regions.cols_expected / scale_regions.bwfiles,
+            "Number of bins does not match expected number of columns: {} != {}",
+            bins.len(),
+            scale_regions.cols_expected / scale_regions.bwfiles
+        );
+        bins
+    }
+
+    pub fn get_endu(&self) -> u32 {
+        match &self.end {
+            Revalue::U(end) => *end,
+            Revalue::V(ends) => *ends.last().unwrap(),
+        }
+    }
+
+    pub fn get_startu(&self) -> u32 {
+        match &self.start {
+            Revalue::U(start) => *start,
+            Revalue::V(starts) => *starts.first().unwrap(),
+        }
+    }
+
+    pub fn scale_regionbody(&self, scale_regions: &Scalingregions, chromend: u32) -> Vec<Bin> {
+        // A vector of bins needs to be constructed for regionbody.
+        // Since this is scaling mode, 'linspace' functionality is reproduced.
+        match self.strand.as_str() {
+            "+" | "." => {
+                match (&self.start, &self.end) {
+                    (Revalue::U(start), Revalue::U(end)) => {
+                        // No exons, forward strand. Divide start - end as such:
+                        // |---un5prime---|---bodylength---|---un3prime---|
+                        let mut un5bins: Vec<Bin> = Vec::new();
+                        let mut un3bins: Vec<Bin> = Vec::new();
+                        let mut innerbins: Vec<Bin> = Vec::new();
+                        if scale_regions.unscaled5prime > 0 {
+                            un5bins.extend((0..scale_regions.unscaled5prime)
+                                .step_by(scale_regions.binsize as usize)
+                                .map(|i| Bin::Conbin(*start + i, *start + i + scale_regions.binsize))
+                                .collect::<Vec<Bin>>());
+                        }
+
+                        if scale_regions.unscaled3prime > 0 {
+                            un3bins.extend( (0..scale_regions.unscaled3prime)
+                                .step_by(scale_regions.binsize as usize)
+                                .rev()
+                                .map(|i| Bin::Conbin(*end - i - scale_regions.binsize, *end - i))
+                                .collect::<Vec<Bin>>() );
+                        }
+                        let bodystart = *start + scale_regions.unscaled5prime;
+                        let bodyend = *end - scale_regions.unscaled3prime;
+
+                        // Get the bins over the body length. These need to be scaled, so similar to deeptools < 4, linspace is used.
+                        let neededbins = (scale_regions.regionbodylength / scale_regions.binsize) as usize;
+                        // There are multiple options here:
+                        // transcriptlength >= regionbodylength -> plain linspace with the requested binsize.
+                        // regionbodylength / binsize <= transcriptlength < regionbodylength -> a scaled binsize between 1 and the requested binsize.
+                        // transcriptlength < regionbodylength / binsize -> repeated indices with a binsize of one.
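+                        // A hedged numeric sketch of the clamp below (hypothetical values): with binsize = 10
+                        // and regionbodylength = 1000, neededbins = 100. A 5000 bp body gives 5000/100 = 50,
+                        // clamped down to 10; a 350 bp body gives 3 (already in range); an 80 bp body gives 0,
+                        // clamped up to 1.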
+ let scaledbinsize = std::cmp::min(std::cmp::max((bodyend - bodystart) / neededbins as u32, 1), scale_regions.binsize); + + innerbins.extend( Array1::linspace(bodystart as f32, (bodyend - scaledbinsize) as f32, neededbins) + .mapv(|x| x as u32) + .map(|x| Bin::Conbin(*x, *x + scaledbinsize)) + .into_iter() + .collect::>() ); + + // Combine the vectors and return + let mut combined_bins = Vec::new(); + if scale_regions.unscaled5prime > 0 { + combined_bins.extend(un5bins.into_iter()); + } + combined_bins.extend(innerbins.into_iter()); + if scale_regions.unscaled3prime > 0 { + combined_bins.extend(un3bins.into_iter()); + } + return combined_bins; + }, + (Revalue::V(start), Revalue::V(end)) => { + let exons: Vec<(u32, u32)> = start.iter().zip(end.iter()) + .map(|(&s, &e)| (s, e)) + .collect(); + let mut un5bins: Vec = Vec::new(); + let mut un3bins: Vec = Vec::new(); + if scale_regions.unscaled5prime > 0 { + let mut walked_bps: u32 = 0; + let mut lastanchor: u32 = start[0]; + + while walked_bps < scale_regions.unscaled5prime { + let (bin, retanch) = refpoint_exonwalker( + &exons, + lastanchor, + scale_regions.binsize, + chromend, + scale_regions.nan_after_end, + true + ); + un5bins.push(bin); + walked_bps += scale_regions.binsize; + lastanchor = retanch; + } + } + if scale_regions.unscaled3prime > 0 { + let mut walked_bps: u32 = 0; + let mut lastanchor: u32 = *end.last().unwrap(); + + while walked_bps < scale_regions.unscaled3prime { + let (bin, retanch) = refpoint_exonwalker( + &exons, + lastanchor, + scale_regions.binsize, + chromend, + scale_regions.nan_after_end, + false + ); + un3bins.push(bin); + walked_bps += scale_regions.binsize; + lastanchor = retanch; + } + } + un3bins.reverse(); + + let bodystart: u32; + let bodyend: u32; + if scale_regions.unscaled5prime > 0 { + bodystart = un5bins.last().unwrap().get_end(); + } else { + bodystart = *start.first().unwrap(); + } + if scale_regions.unscaled3prime > 0 { + bodyend = un3bins.first().unwrap().get_start(); + } else { + bodyend = *end.last().unwrap(); + } + let truebodylength = self.regionlength - scale_regions.unscaled5prime - scale_regions.unscaled3prime; + let neededbins = (scale_regions.regionbodylength / scale_regions.binsize) as usize; + let scaledbinsize = std::cmp::min(std::cmp::max(truebodylength / neededbins as u32, 1), scale_regions.binsize); + // Things are a bit tricky now, as we can do a linspace over the region, but we don't have a notion of the exons. + // I think easiest is to just pull a hashmap over the entire region, get linspace from hashmap to vec, and be done with it. + // technically we fetch a bunch of regions we don't need, but this operation is not too expensive. 
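+                        // Hedged sketch (hypothetical numbers): with truebodylength = 90 and scaledbinsize = 1,
+                        // the map below holds bins for indices 0..=90, and the linspace over neededbins = 100
+                        // samples some indices more than once, so transcripts shorter than the body length
+                        // repeat bins instead of failing.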
+
+                        let mut binmap: HashMap<u32, Bin> = HashMap::new();
+                        let mut lastanchor: u32 = bodystart;
+
+                        for ix in 0..((truebodylength/scaledbinsize)+1) {
+                            let (bin, anchor) = refpoint_exonwalker(
+                                &exons,
+                                lastanchor,
+                                scaledbinsize,
+                                chromend,
+                                scale_regions.nan_after_end,
+                                true
+                            );
+                            lastanchor = anchor;
+                            match bin {
+                                Bin::Conbin(start, end) => {
+                                    if end > bodyend {
+                                        binmap.insert(ix, Bin::Conbin(start, bodyend));
+                                    } else {
+                                        binmap.insert(ix, Bin::Conbin(start, end));
+                                    }
+                                },
+                                Bin::Catbin(bins) => {
+                                    if bins.last().unwrap().1 > bodyend {
+                                        let mut newbins: Vec<(u32, u32)> = Vec::new();
+                                        for (s, e) in bins.iter() {
+                                            if *e > bodyend {
+                                                newbins.push((*s, bodyend));
+                                            } else {
+                                                newbins.push((*s, *e));
+                                            }
+                                        }
+                                        binmap.insert(ix, Bin::Catbin(newbins));
+                                    } else {
+                                        binmap.insert(ix, Bin::Catbin(bins));
+                                    }
+                                },
+                            }
+                        }
+
+                        let innerbins = Array1::linspace(0 as f32, ((truebodylength)/scaledbinsize) as f32, neededbins)
+                            .mapv(|x| x as u32)
+                            .map(|x| binmap.get(&x).unwrap().clone())
+                            .into_iter()
+                            .collect::<Vec<Bin>>();
+
+                        // Combine the vectors and return
+                        let mut combined_bins = Vec::new();
+                        if scale_regions.unscaled5prime > 0 {
+                            combined_bins.extend(un5bins.into_iter());
+                        }
+                        combined_bins.extend(innerbins.into_iter());
+                        if scale_regions.unscaled3prime > 0 {
+                            combined_bins.extend(un3bins.into_iter());
+                        }
+                        return combined_bins;
+                    },
+                    _ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined."),
+                }
+            },
+            "-" => {
+                match (&self.start, &self.end) {
+                    (Revalue::U(start), Revalue::U(end)) => {
+                        // No exons, negative strand. Divide start - end as such:
+                        // |---un3prime---|---bodylength---|---un5prime---|
+                        let mut un5bins: Vec<Bin> = Vec::new();
+                        let mut un3bins: Vec<Bin> = Vec::new();
+                        let mut innerbins: Vec<Bin> = Vec::new();
+                        if scale_regions.unscaled5prime > 0 {
+                            un5bins.extend((0..scale_regions.unscaled5prime)
+                                .step_by(scale_regions.binsize as usize)
+                                .map(|i| Bin::Conbin(*end - i - scale_regions.binsize, *end - i))
+                                .collect::<Vec<Bin>>());
+                        }
+
+                        if scale_regions.unscaled3prime > 0 {
+                            un3bins.extend( (0..scale_regions.unscaled3prime)
+                                .step_by(scale_regions.binsize as usize)
+                                .rev()
+                                .map(|i| Bin::Conbin(*start + scale_regions.unscaled3prime - i - scale_regions.binsize, *start + scale_regions.unscaled3prime - i))
+                                .collect::<Vec<Bin>>() );
+                        }
+                        let bodystart = *start + scale_regions.unscaled3prime;
+                        let bodyend = *end - scale_regions.unscaled5prime;
+
+                        // Get the bins over the body length. These need to be scaled, so similar to deeptools < 4, linspace is used.
+                        let neededbins = (scale_regions.regionbodylength / scale_regions.binsize) as usize;
+                        // There are multiple options here:
+                        // transcriptlength >= regionbodylength -> plain linspace with the requested binsize.
+                        // regionbodylength / binsize <= transcriptlength < regionbodylength -> a scaled binsize between 1 and the requested binsize.
+                        // transcriptlength < regionbodylength / binsize -> repeated indices with a binsize of one.
+ let mut scaledbinsize = (bodyend - bodystart)/neededbins as u32; + if scaledbinsize == 0 { + scaledbinsize = 1; + } + if scaledbinsize > scale_regions.binsize { + scaledbinsize = scale_regions.binsize; + } + + innerbins.extend( Array1::linspace(bodystart as f32, (bodyend - scaledbinsize) as f32, neededbins) + .mapv(|x| x as u32) + .map(|x| Bin::Conbin(*x, *x + scaledbinsize)) + .into_iter() + .collect::>() ); + + // Reverse innerbins to go from 3' -> 5' + innerbins.reverse(); + // Combine the vectors and return + let mut combined_bins = Vec::new(); + if scale_regions.unscaled3prime > 0 { + combined_bins.extend(un3bins.into_iter()); + } + combined_bins.extend(innerbins.into_iter()); + if scale_regions.unscaled5prime > 0 { + combined_bins.extend(un5bins.into_iter()); + } + return combined_bins; + }, + (Revalue::V(start), Revalue::V(end)) => { + let exons: Vec<(u32, u32)> = start.iter().zip(end.iter()) + .map(|(&s, &e)| (s, e)) + .collect(); + let mut un5bins: Vec = Vec::new(); + let mut un3bins: Vec = Vec::new(); + if scale_regions.unscaled5prime > 0 { + let mut walked_bps: u32 = 0; + let mut lastanchor: u32 = *end.last().unwrap(); + + while walked_bps < scale_regions.unscaled5prime { + let (bin, retanch) = refpoint_exonwalker( + &exons, + lastanchor, + scale_regions.binsize, + chromend, + scale_regions.nan_after_end, + false + ); + un5bins.push(bin); + walked_bps += scale_regions.binsize; + lastanchor = retanch; + } + } + un5bins.reverse(); + + if scale_regions.unscaled3prime > 0 { + let mut walked_bps: u32 = 0; + let mut lastanchor: u32 = start[0]; + + while walked_bps < scale_regions.unscaled3prime { + let (bin, retanch) = refpoint_exonwalker( + &exons, + lastanchor, + scale_regions.binsize, + chromend, + scale_regions.nan_after_end, + true + ); + un3bins.push(bin); + walked_bps += scale_regions.binsize; + lastanchor = retanch; + } + } + + let bodystart: u32; + let bodyend: u32; + if scale_regions.unscaled3prime > 0 { + bodystart = un3bins.last().unwrap().get_end(); + } else { + bodystart = *start.first().unwrap(); + } + if scale_regions.unscaled5prime > 0 { + bodyend = un5bins.first().unwrap().get_start(); + } else { + bodyend = *end.last().unwrap(); + } + let truebodylength = self.regionlength - scale_regions.unscaled5prime - scale_regions.unscaled3prime; + let neededbins = (scale_regions.regionbodylength / scale_regions.binsize) as usize; + let scaledbinsize = std::cmp::min(std::cmp::max(truebodylength / neededbins as u32, 1), scale_regions.binsize); + // Things are a bit tricky now, as we can do a linspace over the region, but we don't have a notion of the exons. + // I think easiest is to just pull a hashmap over the entire region, get linspace from hashmap to vec, and be done with it. + // technically we fetch a bunch of regions we don't need, but this operation is not too expensive. 
+ + let mut binmap: HashMap = HashMap::new(); + let mut lastanchor: u32 = bodystart; + + for ix in 0..((truebodylength/scaledbinsize)+1) { + let (bin, anchor) = refpoint_exonwalker( + &exons, + lastanchor, + scaledbinsize, + chromend, + scale_regions.nan_after_end, + true + ); + lastanchor = anchor; + match bin { + Bin::Conbin(start, end) => { + if end > bodyend { + binmap.insert(ix, Bin::Conbin(start, bodyend)); + } else { + binmap.insert(ix, Bin::Conbin(start, end)); + } + }, + Bin::Catbin(bins) => { + if bins.last().unwrap().1 > bodyend { + let mut newbins: Vec<(u32, u32)> = Vec::new(); + for (s, e) in bins.iter() { + if *e > bodyend { + newbins.push((*s, bodyend)); + } else { + newbins.push((*s, *e)); + } + } + binmap.insert(ix, Bin::Catbin(newbins)); + } else { + binmap.insert(ix, Bin::Catbin(bins)); + } + }, + } + } + + let innerbins = Array1::linspace(0 as f32, ((truebodylength)/scaledbinsize) as f32, neededbins) + .mapv(|x| x as u32) + .map(|x| binmap.get(&x).unwrap().clone()) + .into_iter() + .collect::>(); + // Combine the vectors and return + let mut combined_bins = Vec::new(); + if scale_regions.unscaled5prime > 0 { + combined_bins.extend(un5bins.into_iter()); + } + combined_bins.extend(innerbins.into_iter()); + if scale_regions.unscaled3prime > 0 { + combined_bins.extend(un3bins.into_iter()); + } + return combined_bins; + }, + _ => panic!("Start and End are not either both u32, or Vecs. This means your regions file is ill-defined."), + } + }, + _ => panic!("Strand should either be + or - or . {:?} is not supported.", self.strand), + } + } +} + +fn refpoint_exonwalker(exons: &Vec<(u32, u32)>, anchor: u32, binsize: u32, chromend: u32, nan_after_end: bool, forward: bool) -> (Bin, u32) { + // Basic function that walks over exons, and returns a Bin (Either Conbin or Catbin) and the last anchorpoint. + let mut anchorix: Option = None; + + for (ix, exon) in exons.iter().enumerate() { + if anchor >= exon.0 && anchor <= exon.1 { + anchorix = Some(ix); + } + } + if forward { + // Walk forward (downstream, towards chromosome end) + match anchorix { + Some(i) => { + // anchor sits in an exon. Check if anchor + binsize is also in same exon. + if anchor + binsize <= exons[i].1 { + (Bin::Conbin(anchor, anchor + binsize), anchor + binsize) + } else { + // anchor + binsize is not in same exon. We need a Catbin. + // Things are a bit more difficult here as well, as we need to walk exons. + let mut start_end_vec: Vec<(u32, u32)> = Vec::new(); + start_end_vec.push( (anchor, exons[i].1) ); + + let mut remainingbin: u32 = binsize - (exons[i].1 - anchor); + let mut lastix: usize = i; + let mut lastanchor: u32 = exons[i].1; + + while remainingbin != 0 { + if lastix + 1 < exons.len() { + // next exon is available. + // Two options here: + // the remainder fits in the lastix + 1 exon. We are done. + // the remainder doesn't fit in the lastix + 1 exon. We need to walk further. + if exons[lastix+1].1 - exons[lastix+1].0 >= remainingbin { + // remainder fits in next exon. + start_end_vec.push( (exons[lastix+1].0, exons[lastix+1].0 + remainingbin) ); + lastanchor = exons[lastix+1].0 + remainingbin; + remainingbin = 0; + } else { + // Remainder is larger then our exon. We need another walk. + start_end_vec.push( (exons[lastix+1].0, exons[lastix+1].1) ); + remainingbin -= exons[lastix+1].1 - exons[lastix+1].0; + lastix += 1; + lastanchor = exons[lastix].1; + } + } else { + // No next exon available. Remainder can just be genomic. + // The last entry here can be changed to include the last part. 
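+                    // Hedged example (hypothetical coordinates): exons = [(100, 200)], anchor = 180, binsize = 50.
+                    // The exon covers only 20 of the 50 bp and there is no next exon, so with
+                    // nan_after_end == false the entry (180, 200) is stretched genomically to (180, 230)
+                    // (capped at chromend), giving Conbin(180, 230).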
+ if nan_after_end { + start_end_vec.push((0,0)); + } else { + let last = start_end_vec.last_mut().unwrap(); + assert_eq!( + last.1, + lastanchor, + "In the exon - genomic walk, our coordinates are not contiguous" + ); + // Check we don't fall of the chromosome. + if lastanchor + remainingbin > chromend { + last.1 = chromend; + lastanchor = chromend; + } else { + last.1 = lastanchor + remainingbin; + lastanchor = lastanchor + remainingbin; + } + } + remainingbin = 0; + } + } + // We now have a Vec of start - end, we can construct a CatBin. + // Note that CatBins are (absstart, absstop, ((intstart1, intstart2), ...)) + // This seems weird, but makes sure we need to slice the bigwig file only once per bin. + if start_end_vec.len() == 1 { + (Bin::Conbin(anchor, lastanchor), lastanchor) + } else { + (Bin::Catbin(start_end_vec), lastanchor) + } + } + }, + None => { + // our anchor doesn't sit in exons. We just return the anchor + binsize as Bin + if anchor + binsize > chromend { + (Bin::Conbin(anchor, chromend), chromend) + } else { + (Bin::Conbin(anchor, anchor + binsize), anchor + binsize) + } + } + } + } else { + // Walk backwards (upstream, towards chromosome start) + match anchorix { + Some(i) => { + // anchor sits in an exon. Check if anchor - binsize is also in same exon. + if anchor - binsize >= exons[i].0 { + (Bin::Conbin(anchor - binsize, anchor), anchor - binsize) + } else { + // anchor + binsize is not in same exon. We need a Catbin. + // Things are a bit more difficult here as well, as we need to walk exons. + let mut start_end_vec: Vec<(u32, u32)> = Vec::new(); + start_end_vec.push( (exons[i].0, anchor) ); + + let mut remainingbin: u32 = binsize - (anchor - exons[i].0); + let mut lastix: usize = i; + let mut lastanchor: u32 = exons[i].0; + + while remainingbin != 0 { + if lastix >= 1 { + // previous exon is available. + // Two options here: + // the remainder fits in the previous exon. We are done. + // the remainder doesn't fit in the previous exon. We need to walk further. + if exons[lastix-1].1 - exons[lastix-1].0 >= remainingbin { + // remainder fits in next exon. + start_end_vec.push( (exons[lastix-1].1 - remainingbin, exons[lastix-1].1) ); + lastanchor = exons[lastix-1].1 - remainingbin; + remainingbin = 0; + } else { + // Remainder is larger then our exon. We need another walk. + start_end_vec.push( (exons[lastix-1].0, exons[lastix-1].1 ) ); + remainingbin -= exons[lastix-1].1 - exons[lastix-1].0; + lastix -= 1; + lastanchor = exons[lastix].0; + } + } else { + // No previous exon available. Remainder can just be genomic. + // The last entry here can be changed to include the last part. + if nan_after_end { + start_end_vec.push( (0,0) ); + } else { + let last = start_end_vec.last_mut().unwrap(); + assert_eq!( + last.0, + lastanchor, + "In the exon - genomic walk (reverse), our coordinates are not contiguous" + ); + // Check we don't go in the negative. + if lastanchor < remainingbin { + last.0 = 0; + lastanchor = 0; + } else { + last.0 = lastanchor - remainingbin; + lastanchor = lastanchor - remainingbin; + } + } + remainingbin = 0; + } + } + // We now have a Vec of start - end, we can construct a CatBin. + // Note that CatBins are (absstart, absstop, ((intstart1, intstart2), ...)) + // This seems weird, but makes sure we need to slice the bigwig file only once per bin. 
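+                // Hedged example (hypothetical coordinates): exons = [(100, 200), (300, 400)], anchor = 320,
+                // binsize = 50. The walk takes (300, 320) from the second exon plus 30 bp of the first,
+                // (170, 200); after the reverse below this yields Catbin([(170, 200), (300, 320)]) with
+                // 170 as the new anchor.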
+ if start_end_vec.len() == 1 { + (Bin::Conbin(lastanchor, anchor), lastanchor) + } else { + start_end_vec.reverse(); + (Bin::Catbin(start_end_vec), lastanchor) + } + } + }, + None => { + // our anchor doesn't sit in exons. We just return the anchor - binsize as Bin + if anchor < binsize { + (Bin::Conbin(0, anchor), 0) + } else { + (Bin::Conbin(anchor - binsize, anchor), anchor - binsize) + } + } + } + } +} + + +pub fn region_divider(regs: &Vec) -> Vec> { + // This function decides on how regions are divided to process in parallel. + // Since the Vec could be chromosomes on one hand, or genes on the other, we can decide on how to chop them up. + // Note that this somehow relates to 'genomeChunkLength', but not entirely. The sweet spot is bp length is ~ 1000000 + let mut bplen: u32 = 0; + let mut blocks: Vec> = Vec::new(); + let mut tempregionvec: Vec = Vec::new(); + + for reg in regs.iter() { + if reg.regionlength > 1000000 { + if tempregionvec.len() > 0 { + blocks.push(tempregionvec.clone()); + tempregionvec = Vec::new(); + bplen = 0 + } + blocks.push(vec![reg.clone()]); + } else { + tempregionvec.push(reg.clone()); + bplen += reg.regionlength; + } + if bplen > 1000000 { + blocks.push(tempregionvec.clone()); + tempregionvec = Vec::new(); + bplen = 0; + } + } + blocks +} + + +#[derive(Debug)] +pub struct Scalingregions { + pub upstream: u32, + pub downstream: u32, + pub unscaled5prime: u32, + pub unscaled3prime: u32, + pub regionbodylength: u32, + pub binsize: u32, + pub cols_expected: usize, + pub bpsum: u32, + pub missingdata_as_zero: bool, + pub scale: f32, + pub nan_after_end: bool, + pub skipzero: bool, + pub minthresh: f32, + pub maxthresh: f32, + pub referencepoint: String, + pub mode: String, + pub bwfiles: usize, + pub avgtype: String, + pub verbose: bool, + pub proc_number: usize, + pub regionlabels: Vec, + pub bwlabels: Vec +} + +#[derive(Clone)] +pub struct Gtfparse { + pub metagene: bool, + pub txnid: String, + pub exonid: String, + pub txniddesignator: String, +} + +#[derive(Clone)] +pub enum Revalue { + U(u32), + V(Vec), +} + +impl Revalue { + pub fn rewrites(&self) -> String { + match self { + Revalue::U(v) => format!("{}", v), + Revalue::V(vs) => vs.iter() + .map(|v| v.to_string()) + .collect::>() + .join(","), + } + } +} + +impl fmt::Debug for Revalue { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Revalue::U(value) => write!(f, "U({})", value), + Revalue::V(values) => write!(f, "V({:?})", values), + } + } +} + +impl fmt::Display for Revalue { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Revalue::U(value) => write!(f, "U({})", value), + Revalue::V(values) => write!(f, "V({})", values.iter().map(|v| v.to_string()).collect::>().join(", ")), + } + } +} + +#[derive(Clone, Debug)] +pub enum Bin { + Conbin(u32, u32), + Catbin(Vec<(u32, u32)>), +} + +impl Bin { + pub fn get_start(&self) -> u32 { + match self { + Bin::Conbin(start, _) => *start, + Bin::Catbin(starts) => starts.first().unwrap().0, + } + } + pub fn get_end(&self) -> u32 { + match self { + Bin::Conbin(_, end) => *end, + Bin::Catbin(ends) => ends.last().unwrap().1, + } + } +} diff --git a/src/filehandler.rs b/src/filehandler.rs index 0a900bb90e..aff8017d4a 100644 --- a/src/filehandler.rs +++ b/src/filehandler.rs @@ -1,4 +1,4 @@ -use rust_htslib::bam::{Read, Reader}; +use rust_htslib::bam::{Read, Reader, IndexedReader}; use itertools::Itertools; use std::io::{BufReader, BufWriter, Write}; use std::io::prelude::*; @@ -9,7 +9,7 @@ use 
bigtools::beddata::BedParserStreamingIterator; use std::collections::HashMap; use flate2::write::GzEncoder; use flate2::Compression; -use crate::computematrix::{Scalingregions, Gtfparse, Revalue, Region, Bin}; +use crate::covcalc::{Scalingregions, Gtfparse, Revalue, Region, Bin}; use crate::calc::{mean_float, median_float, min_float, max_float, sum_float, std_float}; pub fn bam_ispaired(bam_ifile: &str) -> bool { @@ -107,8 +107,8 @@ pub fn read_gtffile(gtf_file: &String, gtfparse: &Gtfparse, chroms: Vec<&String> let (starts, ends): (Vec, Vec) = txnentry.iter().map(|(s, e)| (*s, *e)).unzip(); let chrom = txn_chrom.get(&txnid).unwrap().to_string(); - if !chroms.contains(&&chrom) { - println!("Warning, region {} not found in at least one of the bigwig files. Skipping {}.", chrom, txnid); + if !chroms.contains(&&chrom.to_string()) { + println!("Warning, region {} not found in at least one of the bigwig/bam files. Skipping {}.", chrom, txnid); } else { regions.push( Region { @@ -151,9 +151,8 @@ pub fn read_gtffile(gtf_file: &String, gtfparse: &Gtfparse, chroms: Vec<&String> } else { names.insert(entryname.clone(), 0); } - if !chroms.contains(&&fields[0].to_string()) { - println!("Warning, region {} not found in at least one of the bigwig files. Skipping {}.", fields[0], entryname); + println!("Warning, region {} not found in at least one of the bigwig/bam files. Skipping {}.", fields[0], entryname); } else { regions.push( Region { @@ -203,8 +202,10 @@ pub fn read_bedfile(bed_file: &String, metagene: bool, chroms: Vec<&String>) -> } let chrom = fields[0]; let mut entryname = format!("{}:{}-{}", fields[0], fields[1], fields[2]); + // Only check for valid chroms, if chroms vec is not empty. + if !chroms.contains(&&chrom.to_string()) { - println!("Warning, region {} not found in at least one of the bigwig files. Skipping {}.", chrom, entryname); + println!("Warning, region {} not found in at least one of the bigwig/bam files. Skipping {}.", chrom, entryname); continue; } if names.contains_key(&entryname) { @@ -236,7 +237,7 @@ pub fn read_bedfile(bed_file: &String, metagene: bool, chroms: Vec<&String>) -> let chrom = fields[0]; let mut entryname = fields[3].to_string(); if !chroms.contains(&&chrom.to_string()) { - println!("Warning, region {} not found in at least one of the bigwig files. Skipping {}.", chrom, entryname); + println!("Warning, region {} not found in at least one of the bigwig/bam files. Skipping {}.", chrom, entryname); continue; } let start = fields[1].parse().unwrap(); @@ -265,7 +266,7 @@ pub fn read_bedfile(bed_file: &String, metagene: bool, chroms: Vec<&String>) -> let chrom = fields[0]; let mut entryname = fields[3].to_string(); if !chroms.contains(&&chrom.to_string()) { - println!("Warning, region {} not found in at least one of the bigwig files. Skipping {}.", chrom, entryname); + println!("Warning, region {} not found in at least one of the bigwig/bam files. 
Skipping {}.", chrom, entryname); continue; } if names.contains_key(&entryname) { @@ -351,6 +352,20 @@ pub fn chrombounds_from_bw(bwfile: &str) -> HashMap { chromsizes } +pub fn chrombounds_from_bam(bamfile: &str) -> HashMap { + let bam = IndexedReader::from_path(bamfile).unwrap(); + let header = bam.header().clone(); + let mut chromsizes = HashMap::new(); + for tid in 0..header.target_count() { + let chromname = String::from_utf8(header.tid2name(tid).to_vec()) + .expect("Invalid UTF-8 in chromosome name"); + let chromlen = header.target_len(tid) + .expect("Error retrieving length for chromosome"); + chromsizes.insert(chromname.to_string(), chromlen as u32); + } + chromsizes +} + pub fn bwintervals( bwfile: &str, regions: &Vec, diff --git a/src/multibamsummary.rs b/src/multibamsummary.rs index 119c11e406..c22cf7d46f 100644 --- a/src/multibamsummary.rs +++ b/src/multibamsummary.rs @@ -6,15 +6,19 @@ use itertools::{multizip, multiunzip, izip}; use std::io::prelude::*; use std::io::BufReader; use std::fs::File; -use ndarray::{Array1, Array2, stack}; +use ndarray::{Array1, Array2, stack, Axis}; use std::io; use ndarray_npy::NpzWriter; use std::borrow::Cow; -use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters}; -use crate::filehandler::{bam_ispaired}; -use crate::calc::{median, calc_ratio}; +use std::collections::HashMap; +use std::path::Path; +use std::sync::{Arc, Mutex}; +use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, region_divider}; +use crate::filehandler::{bam_ispaired, read_bedfile, read_gtffile, chrombounds_from_bam}; +use crate::calc::{median, calc_ratio, deseq_scalefactors}; use crate::bamcompare::ParsedBamFile; use crate::normalization::scale_factor_bamcompare; +use crate::covcalc::{Region, Gtfparse}; #[pyfunction] pub fn r_mbams( @@ -27,13 +31,13 @@ pub fn r_mbams( scaling_factors: &str, // optional parameters labels: Py, - binsize: u32, + mut binsize: u32, distance_between_bins: u32, nproc: usize, - _bed_file: &str, - regions: Vec<(String, u32, u32)>, + bed_file: Py, + sup_regions: Vec<(String, u32, u32)>, _blacklist: &str, - _verbose: bool, + verbose: bool, _extend_reads: u32, _center_reads: bool, sam_flag_incl: u16, // sam flag include @@ -41,17 +45,19 @@ pub fn r_mbams( min_fragment_length: u32, // minimum fragment length. max_fragment_length: u32, // maximum fragment length. min_mapping_quality: u8, // minimum mapping quality. - _keep_exons: bool, - _txnid: &str, // transcript id to use when parsing GTF file - _exonid: &str, // exon id to use when parsing GTF file - _txniddesignator: &str, // designator to use when parsing GTF file + metagene: bool, // metagene mode or not. + txnid: &str, // transcript id to use when parsing GTF file. + exonid: &str, // exon id to use when parsing GTF file. + txniddesignator: &str, // designator to use when parsing GTF file. 
) -> PyResult<()> {
     let mut bamfiles: Vec<String> = Vec::new();
     let mut bamlabels: Vec<String> = Vec::new();
+    let mut bedfiles: Vec<String> = Vec::new();
     let mut ignorechr: Vec<String> = Vec::new();
     Python::with_gil(|py| {
         bamfiles = bam_files.extract(py).expect("Failed to retrieve bam files.");
         bamlabels = labels.extract(py).expect("Failed to retrieve labels.");
+        bedfiles = bed_file.extract(py).expect("Failed to retrieve bedfiles.");
     });
     let max_len = bamlabels.iter().map(|s| s.len()).max().unwrap_or(0);
@@ -65,10 +71,12 @@
         .collect::<Vec<bool>>();
     // zip through ispe and bamfiles
-    for (_ispe, _bf) in ispe.iter().zip(bamfiles.iter()) {
-        println!("Sample: {} is-paired: {}", _bf, _ispe);
+    if verbose {
+        for (_ispe, _bf) in ispe.iter().zip(bamfiles.iter()) {
+            println!("Sample: {} is-paired: {}", _bf, _ispe);
+        }
     }
-
+
     let filters: Alignmentfilters = Alignmentfilters {
         minmappingquality: min_mapping_quality,
         samflaginclude: sam_flag_incl,
@@ -77,17 +85,66 @@
         maxfraglen: max_fragment_length
     };
-    let (regions, chromsizes) = parse_regions(&regions, bamfiles.get(0).unwrap());
-    let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();
+    let mut regions: Vec<Region> = Vec::new();
+
+    if mode == "BED-file" {
+        if verbose {
+            println!("BED file mode. with files: {:?}", bedfiles);
+        }
+        let gtfparse = Gtfparse {
+            metagene: metagene,
+            txnid: txnid.to_string(),
+            exonid: exonid.to_string(),
+            txniddesignator: txniddesignator.to_string(),
+        };
+        // From the first bamfile, get the chromosome sizes. Only the chromosome names are needed, to make sure no invalid regions are included later on.
+        let chromsizes = chrombounds_from_bam(bamfiles.get(0).unwrap());
+        binsize = 1;
+        // Parse regions from bed files. Note that we retain the name of the bed file (in case there are more than one)
+        // Additionally, score and strand are also retained; if it's a 3-column bed file we just fill in '.'
+        let mut regionsizes: HashMap<String, u32> = HashMap::new();
+        bedfiles.iter()
+            .map(|r| {
+                let ext = Path::new(r)
+                    .extension()
+                    .and_then(|e| e.to_str())
+                    .map(|e| e.to_ascii_lowercase());
+
+                match ext {
+                    Some(v) if v == "gtf".to_string() => read_gtffile(r, &gtfparse, chromsizes.keys().collect()),
+                    Some(v) if v == "bed".to_string() => read_bedfile(r, metagene, chromsizes.keys().collect()),
+                    _ => panic!("Only .bed and .gtf files are allowed as regions. File = {}, Extension = {:?}", r, ext),
+                }
+            })
+            .for_each(|(reg, regsize)| {
+                regions.extend(reg);
+                regionsizes.insert(regsize.0, regsize.1);
+            });
+    } else {
+        if verbose {
+            println!("BINS mode. with binsize: {}", binsize);
+        }
+        let (parsedregions, chromsizes) = parse_regions(&sup_regions, bamfiles.get(0).unwrap());
+        regions = parsedregions;
+    }
+
+    let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();
     // Zip together bamfiles and ispe into a vec of tuples.
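    // e.g. (hypothetical inputs) ["t.bam", "i.bam"] zipped with [true, false] becomes
    // [("t.bam", true), ("i.bam", false)], one (path, is-paired-end) tuple per sample.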
    let bampfiles: Vec<_> = bamfiles.into_iter().zip(ispe.into_iter()).collect();
+    // Divide up the regions into regionBlocks
+    let regionblocks = region_divider(&regions);
+    if verbose {
+        println!("Regions divided into {} parallel blocks", regionblocks.len());
+        println!("Start coverage calculation");
+    }
+
     let covcalcs: Vec<_> = pool.install(|| {
         bampfiles.par_iter()
             .map(|(bamfile, ispe)| {
-                let (bg, mapped, unmapped, readlen, fraglen) = regions.par_iter()
+                let (bg, mapped, unmapped, readlen, fraglen) = regionblocks.par_iter()
                     .map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false))
                     .reduce(
                         || (vec![], 0, 0, vec![], vec![]),
@@ -104,66 +161,81 @@
             })
             .collect()
     });
-
-
-    // Scale factors (when needed)
-    // relevant python code.
-    // loggeomeans = np.sum(np.log(m), axis=1) / m.shape[1]
-    // # Mask after computing the geometric mean
-    // m = np.ma.masked_where(m <= 0, m)
-    // loggeomeans = np.ma.masked_where(np.isinf(loggeomeans), loggeomeans)
-    // # DESeq2 ratio-based size factor
-    // sf = np.exp(np.ma.median((np.log(m).T - loggeomeans).T, axis=0))
-    // return 1. / sf
-    // create output file to write to
-
-    // create output file to write lines into
-    let mut cfile = None;
-    if out_raw_counts != "None" {
-        cfile = Some(File::create(out_raw_counts).unwrap());
+    if verbose {
+        println!("Coverage calculation done");
+        println!("Define output file");
     }
-    let mut matvec: Vec<Array1<f32>> = Vec::new();
-
-    let its: Vec<_> = covcalcs.iter().map(|x| x.iter()).collect();
+    // Collate the coverage files into a matrix.
+    let its: Vec<_> = covcalcs.iter().map(|x| x.into_iter()).collect();
     let zips = TempZip { iterators: its };
-    for c in zips {
-        // set up Readers
-        let readers: Vec<_> = c.iter().map(|x| BufReader::new(File::open(x).unwrap()).lines()).collect();
-
-        for mut _l in (TempZip { iterators: readers }) {
-            // unwrap all lines in _l
-            let lines: Vec<_> = _l
-                .iter_mut()
-                .map(|x| x.as_mut().unwrap())
-                .map(|x| x.split('\t').collect())
-                .map(|x: Vec<&str> | ( x[0].to_string(), x[1].parse::<u32>().unwrap(), x[2].parse::<u32>().unwrap(), x[3].parse::<f32>().unwrap() ) )
                .collect();
-            let arrline = Array1::from(lines.iter().map(|x| x.3).collect::<Vec<f32>>());
-            matvec.push(arrline);
-            // Write to countsfile if cfile is not None
-            if let Some(ref mut cfile) = cfile {
-                let mut line: Vec<Countline> = Vec::new();
-                line.push(Countline::Text(lines[0].0.clone()));
-                line.push(Countline::Int(lines[0].1));
-                line.push(Countline::Int(lines[0].2));
-                for _line in lines {
-                    line.push(Countline::Float(_line.3));
+    if verbose {
+        println!("Start iterating through temp coverage files and create output npz.");
+    }
+    let zips_vec: Vec<_> = zips.collect();
+    let matvec: Array2<f32> = pool.install(|| {
+        let matvecs: Vec<Array1<f32>> = zips_vec
+            .par_iter()
+            .flat_map(|c| {
+                let readers: Vec<_> = c.iter().map(|x| BufReader::new(File::open(x).unwrap()).lines()).collect();
+                let mut _matvec: Vec<Array1<f32>> = Vec::new();
+                for mut _l in (TempZip { iterators: readers }) {
+                    // unwrap all lines in _l
+                    let lines: Vec<_> = _l
+                        .iter_mut()
+                        .map(|x| x.as_mut().unwrap())
+                        .map(|x| x.split('\t').collect())
+                        .map(|x: Vec<&str> | ( x[0].to_string(), x[1].parse::<u32>().unwrap(), x[2].parse::<u32>().unwrap(), x[3].parse::<f32>().unwrap() ) )
+                        .collect();
+                    let arrline = Array1::from(lines.iter().map(|x| x.3).collect::<Vec<f32>>());
+                    _matvec.push(arrline);
                 }
-                let fline = line.iter().map(|x| x.to_string()).collect::<Vec<String>>().join("\t");
-                writeln!(cfile, "{}", fline).unwrap();
-            }
+                _matvec
+            })
+            .collect();
+        stack(Axis(0), &matvecs.iter().map(|x| 
x.view()).collect::<Vec<_>>()).unwrap()
+    });
+
+    // If scalefactors are required, calc and save them now.
+    if scaling_factors != "None" {
+        let sf = deseq_scalefactors(&matvec);
+        // save scalefactors to file
+        let mut sf_file = File::create(scaling_factors).unwrap();
+        writeln!(sf_file, "Sample\tscalingFactor").unwrap();
+        for (sf, label) in sf.iter().zip(bamlabels.iter()) {
+            writeln!(sf_file, "{}\t{}", label, sf).unwrap();
         }
     }
-    let array2: Array2<f32> = Array2::from_shape_vec(
-        (matvec.len(), matvec[0].len()),
-        matvec.into_iter().flat_map(|x| x.into_iter()).collect()
-    ).unwrap();
     let mut npz = NpzWriter::new(File::create(ofile).unwrap());
-    npz.add_array("matrix", &array2).unwrap();
+    npz.add_array("matrix", &matvec).unwrap();
     npz.add_array("labels", &bamlabels_arr).unwrap();
     npz.finish().unwrap();
+    if verbose {
+        println!("Matrix written.");
+    }
+    // If raw counts are required, write them to file.
+    if out_raw_counts != "None" {
+        let mut cfile = File::create(out_raw_counts).unwrap();
+        // Create the header
+        let mut headstr = String::new();
+        headstr.push_str("#\'chr\'\t\'start\'\t\'end\'");
+        for label in bamlabels.iter() {
+            headstr.push_str(&format!("\t\'{}\'", label));
+        }
+        writeln!(cfile, "{}", headstr).unwrap();
+        for (i, region) in regions.iter().enumerate() {
+            let mut line = String::new();
+            line.push_str(&format!("{}\t{}\t{}", region.chrom, region.get_startu(), region.get_endu()));
+            for j in 0..bamlabels.len() {
+                line.push_str(&format!("\t{}", matvec[[i, j]]));
            }
            writeln!(cfile, "{}", line).unwrap();
        }
    }
+    if verbose {
+        println!("Counts written.");
+    }
     Ok(())
 }

From a42e55d952eae3ffea7540646a03b33bcb7a3048 Mon Sep 17 00:00:00 2001
From: WardDeb <wardddeboutte@gmail.com>
Date: Fri, 24 Jan 2025 20:40:03 +0100
Subject: [PATCH 03/11] Include the regionblocks even if the total genome is <
 1 million

---
 src/covcalc.rs         | 3 +++
 src/multibamsummary.rs | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/src/covcalc.rs b/src/covcalc.rs
index 01a054c456..684ff77a39 100644
--- a/src/covcalc.rs
+++ b/src/covcalc.rs
@@ -1323,6 +1323,9 @@ pub fn region_divider(regs: &Vec<Region>) -> Vec<Vec<Region>> {
             bplen = 0;
         }
     }
+    if tempregionvec.len() > 0 {
+        blocks.push(tempregionvec.clone());
+    }
     blocks
 }

diff --git a/src/multibamsummary.rs b/src/multibamsummary.rs
index c22cf7d46f..a0f9d60fae 100644
--- a/src/multibamsummary.rs
+++ b/src/multibamsummary.rs
@@ -136,6 +136,8 @@ pub fn r_mbams(
     // Divide up the regions into regionBlocks
     let regionblocks = region_divider(&regions);
+
+    assert!(regionblocks.len() > 0, "No regions to process. Exiting.");
     if verbose {
         println!("Regions divided into {} parallel blocks", regionblocks.len());
         println!("Start coverage calculation");

From 31713e617c27bf93d8c60920c32ea68fee0a9bdf Mon Sep 17 00:00:00 2001
From: WardDeb <wardddeboutte@gmail.com>
Date: Fri, 24 Jan 2025 21:14:20 +0100
Subject: [PATCH 04/11] Infer bed/gtf from the first proper data line.
 Escape some warnings.

---
 src/alignmentsieve.rs  |  8 ++++----
 src/computematrix.rs   | 17 +++++++----------
 src/filehandler.rs     | 19 +++++++++++++++++++
 src/multibamsummary.rs | 36 ++++++++----------------------------
 4 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/src/alignmentsieve.rs b/src/alignmentsieve.rs
index 02b46d5d5d..49027ef083 100644
--- a/src/alignmentsieve.rs
+++ b/src/alignmentsieve.rs
@@ -74,7 +74,7 @@ pub fn r_alignmentsieve(
     // write output
     let mut obam = Writer::from_path(ofile, &header, bam::Format::Bam).unwrap();
-    obam.set_threads(nproc);
+    let _ = obam.set_threads(nproc);
     for sb in sieve.into_iter() {
         if let Some(sb) = sb {
             let mut bam = Reader::from_path(&sb).unwrap();
@@ -87,7 +87,7 @@ pub fn r_alignmentsieve(
     // write filtered reads if necessary
     if write_filters {
         let mut ofilterbam = Writer::from_path(filtered_out_readsfile, &header, bam::Format::Bam).unwrap();
-        ofilterbam.set_threads(nproc);
+        let _ = ofilterbam.set_threads(nproc);
         for sb in filtersieve.into_iter() {
             if let Some(sb) = sb {
                 let mut bam = Reader::from_path(&sb).unwrap();
@@ -148,8 +148,8 @@ fn sieve_bamregion(ibam: &str, regstruct: &Region, alfilters: &Alignmentfilters,
     if nproc > 4 {
         let readthreads = 2;
         let writethreads = nproc - 2;
-        bam.set_threads(readthreads);
-        sievebamout.set_threads(writethreads);
+        let _ = bam.set_threads(readthreads);
+        let _ = sievebamout.set_threads(writethreads);
         if verbose {
             println!("Reading = {}, Writing = {}", readthreads, writethreads);
         }
diff --git a/src/computematrix.rs b/src/computematrix.rs
index ea25902118..a632c8257c 100644
--- a/src/computematrix.rs
+++ b/src/computematrix.rs
@@ -1,6 +1,6 @@
 use pyo3::prelude::*;
 use pyo3::types::PyList;
-use crate::filehandler::{read_bedfile, read_gtffile, chrombounds_from_bw, bwintervals, header_matrix, write_matrix};
+use crate::filehandler::{read_bedfile, read_gtffile, chrombounds_from_bw, bwintervals, header_matrix, write_matrix, is_bed_or_gtf};
 use rayon::prelude::*;
 use rayon::ThreadPoolBuilder;
 use std::collections::HashMap;
@@ -128,15 +128,12 @@ pub fn r_computematrix(
     let mut regionsizes: HashMap<String, u32> = HashMap::new();
     region_files.iter()
         .map(|r| {
-            let ext = Path::new(r)
-                .extension()
-                .and_then(|e| e.to_str())
-                .map(|e| e.to_ascii_lowercase());
-
-            match ext {
-                Some(v) if v == "gtf".to_string() => read_gtffile(r, &gtfparse, chromsizes.keys().collect()),
-                Some(v) if v == "bed".to_string() => read_bedfile(r, metagene, chromsizes.keys().collect()),
-                _ => panic!("Only .bed and .gtf files are allowed as regions. File = {}, Extension = {:?}", r, ext),
+            let ftype = is_bed_or_gtf(r);
+
+            match ftype.as_str() {
+                "gtf" => read_gtffile(r, &gtfparse, chromsizes.keys().collect()),
+                "bed" => read_bedfile(r, metagene, chromsizes.keys().collect()),
+                _ => panic!("Only .bed and .gtf files are allowed (as determined by the number of columns). File = {}, detected type = {}", r, ftype),
             }
         })
         .for_each(|(reg, regsize)| {
diff --git a/src/filehandler.rs b/src/filehandler.rs
index aff8017d4a..f9578466f7 100644
--- a/src/filehandler.rs
+++ b/src/filehandler.rs
@@ -47,6 +47,25 @@ where
     }
 }

+pub fn is_bed_or_gtf(fp: &str) -> String {
+    // Check if file is a bed or gtf file.
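+    // Hedged sketch of the heuristic (hypothetical lines): a GTF data line always has 9
+    // tab-separated fields, e.g. chr1 HAVANA exon 11869 12227 . + . gene_id "g1";,
+    // whereas BED data lines have 3, 6 or 12, so any non-9 column count falls through to "bed".
+    // Note that a 9-column BED file would be misread as gtf under this rule.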
+    let file = File::open(fp).expect(format!("Failed to open file: {}", fp).as_str());
+    let reader = BufReader::new(file);
+    // Get the first line that doesn't start with #
+    for line in reader.lines() {
+        let line = line.unwrap();
+        if !line.starts_with('#') {
+            let fields: Vec<&str> = line.split('\t').collect();
+            if fields.len() == 9 {
+                return "gtf".to_string();
+            } else {
+                return "bed".to_string();
+            }
+        }
+    }
+    "Unknown".to_string()
+}
+
 pub fn read_gtffile(gtf_file: &String, gtfparse: &Gtfparse, chroms: Vec<&String>) -> (Vec<Region>, (String, u32)) {
     // At some point this zoo of String clones should be refactored. Not now though, we have a deadline.
     let mut regions: Vec<Region> = Vec::new();
diff --git a/src/multibamsummary.rs b/src/multibamsummary.rs
index a0f9d60fae..3fc9c9ec5b 100644
--- a/src/multibamsummary.rs
+++ b/src/multibamsummary.rs
@@ -14,7 +14,7 @@ use std::collections::HashMap;
 use std::path::Path;
 use std::sync::{Arc, Mutex};
 use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, region_divider};
-use crate::filehandler::{bam_ispaired, read_bedfile, read_gtffile, chrombounds_from_bam};
+use crate::filehandler::{bam_ispaired, read_bedfile, read_gtffile, chrombounds_from_bam, is_bed_or_gtf};
 use crate::calc::{median, calc_ratio, deseq_scalefactors};
 use crate::bamcompare::ParsedBamFile;
 use crate::normalization::scale_factor_bamcompare;
@@ -106,15 +106,12 @@ pub fn r_mbams(
         let mut regionsizes: HashMap<String, u32> = HashMap::new();
         bedfiles.iter()
             .map(|r| {
-                let ext = Path::new(r)
-                    .extension()
-                    .and_then(|e| e.to_str())
-                    .map(|e| e.to_ascii_lowercase());
-
-                match ext {
-                    Some(v) if v == "gtf".to_string() => read_gtffile(r, &gtfparse, chromsizes.keys().collect()),
-                    Some(v) if v == "bed".to_string() => read_bedfile(r, metagene, chromsizes.keys().collect()),
-                    _ => panic!("Only .bed and .gtf files are allowed as regions. File = {}, Extension = {:?}", r, ext),
+                let ftype = is_bed_or_gtf(r);
+
+                match ftype.as_str() {
+                    "gtf" => read_gtffile(r, &gtfparse, chromsizes.keys().collect()),
+                    "bed" => read_bedfile(r, metagene, chromsizes.keys().collect()),
+                    _ => panic!("Only .bed and .gtf files are allowed (as determined by the number of columns). File = {}, detected type = {}", r, ftype),
                 }
             })
             .for_each(|(reg, regsize)| {
@@ -146,7 +143,7 @@ pub fn r_mbams(
     let covcalcs: Vec<_> = pool.install(|| {
         bampfiles.par_iter()
             .map(|(bamfile, ispe)| {
-                let (bg, mapped, unmapped, readlen, fraglen) = regionblocks.par_iter()
+                let (bg, _mapped, _unmapped, _readlen, _fraglen) = regionblocks.par_iter()
                     .map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false))
                     .reduce(
                         || (vec![], 0, 0, vec![], vec![]),
@@ -241,23 +238,6 @@ pub fn r_mbams(
     Ok(())
 }

-#[derive(Debug)]
-enum Countline {
-    Int(u32),
-    Float(f32),
-    Text(String),
-}
-
-impl Countline {
-    fn to_string(&self) -> String {
-        match self {
-            Countline::Int(i) => i.to_string(),
-            Countline::Float(f) => f.to_string(),
-            Countline::Text(t) => t.clone(),
-        }
-    }
-}
-
 struct TempZip<I>
 where I: Iterator {
     iterators: Vec<I>

From 891e58220651cb88e16ed90d522a260da1077ec0 Mon Sep 17 00:00:00 2001
From: WardDeb <wardddeboutte@gmail.com>
Date: Fri, 24 Jan 2025 22:16:10 +0100
Subject: [PATCH 05/11] Include the intersection of chromosomes over all bam
 files; don't crash if they don't all match.
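Chromosome names are now counted across the headers of all input bam files, and only names
present in every file are treated as valid; the rest are warned about and skipped. As a rough
sketch (not the literal code): headers {chr1, chr2} and {chr1} intersect to {chr1}, chr2 is
reported as missing, and we only abort when the intersection is empty.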
---
 src/alignmentsieve.rs  |  2 +-
 src/bamcompare.rs      |  2 +-
 src/bamcoverage.rs     |  2 +-
 src/covcalc.rs         | 50 +++++++++++++++++++++++++++++++++---------
 src/multibamsummary.rs |  8 ++++---
 5 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/src/alignmentsieve.rs b/src/alignmentsieve.rs
index 49027ef083..4bf487d96e 100644
--- a/src/alignmentsieve.rs
+++ b/src/alignmentsieve.rs
@@ -47,7 +47,7 @@ pub fn r_alignmentsieve(
     // shift is of length 0, 2, or 4.

     // Define regions
-    let (regions, chromsizes) = parse_regions(&Vec::new(), bamifile);
+    let (regions, chromsizes) = parse_regions(&Vec::new(), vec![bamifile]);

     let filters = Alignmentfilters{
         minmappingquality: min_mapping_quality,
diff --git a/src/bamcompare.rs b/src/bamcompare.rs
index 3d80cf905d..375b7b3432 100644
--- a/src/bamcompare.rs
+++ b/src/bamcompare.rs
@@ -60,7 +60,7 @@ pub fn r_bamcompare(
     };

     // Parse regions & calculate coverage. Note that
-    let (regions, chromsizes) = parse_regions(&regions, bamifile1);
+    let (regions, chromsizes) = parse_regions(&regions, vec![bamifile1, bamifile2]);
     let regionblocks = region_divider(&regions);

     let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();
diff --git a/src/bamcoverage.rs b/src/bamcoverage.rs
index 9b3a0b6496..1731e3d59b 100644
--- a/src/bamcoverage.rs
+++ b/src/bamcoverage.rs
@@ -70,7 +70,7 @@ pub fn r_bamcoverage(
     }

     // Parse regions & calculate coverage
-    let (regions, chromsizes) = parse_regions(&regions, bamifile);
+    let (regions, chromsizes) = parse_regions(&regions, vec![bamifile]);
     let regionblocks = region_divider(&regions);
     let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();
     let (bg, mapped, _unmapped, readlen, fraglen) = pool.install(|| {
diff --git a/src/covcalc.rs b/src/covcalc.rs
index 684ff77a39..4b2b06c5c4 100644
--- a/src/covcalc.rs
+++ b/src/covcalc.rs
@@ -6,13 +6,39 @@ use std::cmp::min;
 use std::fmt;
 use ndarray::Array1;

-pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: &str) -> (Vec<Region>, HashMap<String, u32>) {
+pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: Vec<&str>) -> (Vec<Region>, HashMap<String, u32>) {
     // Takes a vector of regions, and a bam reference
     // returns a vector of regions, with all chromosomes and full lengths if original regions was empty
     // Else it validates the regions against the information from the bam header
     // Finally, a Vec with chromsizes is returned as well.
-
-    let bam = IndexedReader::from_path(bam_ifile).unwrap();
+    let mut found_chroms: HashMap<String, usize> = HashMap::new();
+    for bam in bam_ifile.iter() {
+        let bam = IndexedReader::from_path(bam).unwrap();
+        let chroms: Vec<String> = bam.header().target_names().iter().map(|x| String::from_utf8(x.to_vec()).unwrap()).collect();
+        for chrom in chroms.iter() {
+            // if it's not in the hashmap, add it, else increment count
+            if !found_chroms.contains_key(chrom) {
+                found_chroms.insert(chrom.clone(), 1);
+            } else {
+                let count = found_chroms.get_mut(chrom).unwrap();
+                *count += 1;
+            }
+        }
+    }
+    let mut validchroms: Vec<String> = Vec::new();
+    // loop over all chroms in the hashmap, if the count is expected, include them
+    for (chrom, count) in found_chroms.iter() {
+        if *count == bam_ifile.len() {
+            validchroms.push(chrom.clone());
+        } else {
+            println!("Chromosome {} is missing in at least one bam file, and thus ignored!", chrom);
+        }
+    }
+    // Crash if validchroms is empty.
+    assert!(!validchroms.is_empty(), "No chromosomes found that are present in all bam files. 
Did you mix references ?"); + println!("Valid chromosomes are: {:?}", validchroms); + // Read header from first bam file + let bam = IndexedReader::from_path(bam_ifile[0]).unwrap(); let header = bam.header().clone(); let mut chromregions: Vec = Vec::new(); let mut chromsizes = HashMap::new(); @@ -23,6 +49,10 @@ pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: &str) -> (Vec .expect("Invalid UTF-8 in chromosome name"); let chromlen = header.target_len(tid) .expect("Error retrieving length for chromosome"); + // If chromname is not in validchroms, skip it. + if !validchroms.contains(&chromname) { + continue; + } let _reg = Region { chrom: chromname.clone(), start: Revalue::U(0), @@ -42,18 +72,18 @@ pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: &str) -> (Vec .expect("Invalid UTF-8 in chromosome name"); let chromlen = header.target_len(tid) .expect("Error retrieving length for chromosome"); - chromsizes.insert(chromname, chromlen as u32); + if validchroms.contains(&chromname) { + chromsizes.insert(chromname, chromlen as u32); + } } - let validchroms: Vec = header - .target_names() - .iter() - .map(|x| String::from_utf8(x.to_vec()).unwrap()) - .collect(); for region in regions { let chromname = ®ion.0; assert!(region.1 < region.2, "Region start must be strictly less than region end."); - assert!(validchroms.contains(chromname), "Chromosome {} not found in bam header", chromname); + // Check if chromname is in validchroms + if !validchroms.contains(chromname) { + continue; + } let _reg = Region { chrom: chromname.clone(), start: Revalue::U(region.1), diff --git a/src/multibamsummary.rs b/src/multibamsummary.rs index 3fc9c9ec5b..34834e2a11 100644 --- a/src/multibamsummary.rs +++ b/src/multibamsummary.rs @@ -32,7 +32,7 @@ pub fn r_mbams( // optional parameters labels: Py, mut binsize: u32, - distance_between_bins: u32, + mut distance_between_bins: u32, nproc: usize, bed_file: Py, sup_regions: Vec<(String, u32, u32)>, @@ -101,6 +101,8 @@ pub fn r_mbams( let chromsizes = chrombounds_from_bam(bamfiles.get(0).unwrap()); binsize = 1; + distance_between_bins = 0; + // Parse regions from bed files. Note that we retain the name of the bed file (in case there are more then 1) // Additionaly, score and strand are also retained, if it's a 3-column bed file we just fill in '.' let mut regionsizes: HashMap = HashMap::new(); @@ -120,9 +122,9 @@ pub fn r_mbams( }); } else { if verbose { - println!("BINS mode. with binsize: {}", binsize); + println!("BINS mode. with binsize: {}, distance between bins: {}", binsize, distance_between_bins); } - let (parsedregions, chromsizes) = parse_regions(&sup_regions, bamfiles.get(0).unwrap()); + let (parsedregions, chromsizes) = parse_regions(&sup_regions, bamfiles.iter().map(|x| x.as_str()).collect()); regions = parsedregions; } From fe8b70beb11da41d965381d5be66bedc70136b30 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Sat, 25 Jan 2025 09:35:11 +0100 Subject: [PATCH 06/11] for regions in mbs same idea, only contain chroms that are present in header of all bam files. 
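
chrombounds_from_bam now applies the same header intersection as
parse_regions. An alternative would be a set intersection instead of
counting; a rough sketch, again with illustrative names only:

    use std::collections::HashSet;

    // Fold the per-file target-name lists into the set of names that
    // every file shares.
    fn shared_chroms(headers: &[Vec<String>]) -> HashSet<String> {
        let mut iter = headers.iter();
        let first: HashSet<String> = match iter.next() {
            Some(h) => h.iter().cloned().collect(),
            None => return HashSet::new(),
        };
        iter.fold(first, |acc, h| {
            let set: HashSet<String> = h.iter().cloned().collect();
            acc.intersection(&set).cloned().collect()
        })
    }

The counting version is kept in the patch because it can warn about each
chromosome that gets dropped, which a plain set intersection cannot do
without extra bookkeeping.
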
---
 src/filehandler.rs | 31 ++++++++++++++++++++++++++++---
 src/multibamsummary.rs | 2 +-
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/filehandler.rs b/src/filehandler.rs
index f9578466f7..b9018a3c3a 100644
--- a/src/filehandler.rs
+++ b/src/filehandler.rs
@@ -371,8 +371,31 @@ pub fn chrombounds_from_bw(bwfile: &str) -> HashMap<String, u32> {
     chromsizes
 }
 
-pub fn chrombounds_from_bam(bamfile: &str) -> HashMap<String, u32> {
-    let bam = IndexedReader::from_path(bamfile).unwrap();
+pub fn chrombounds_from_bam(bamfiles: Vec<&str>) -> HashMap<String, u32> {
+    let mut found_chroms: HashMap<String, usize> = HashMap::new();
+    for bam in bamfiles.iter() {
+        let bam = IndexedReader::from_path(bam).unwrap();
+        let chroms: Vec<String> = bam.header().target_names().iter().map(|x| String::from_utf8(x.to_vec()).unwrap()).collect();
+        for chrom in chroms.iter() {
+            // if it's not in the hashmap, add it, else increment count
+            if !found_chroms.contains_key(chrom) {
+                found_chroms.insert(chrom.clone(), 1);
+            } else {
+                let count = found_chroms.get_mut(chrom).unwrap();
+                *count += 1;
+            }
+        }
+    }
+    let mut validchroms: Vec<String> = Vec::new();
+    // loop over all chroms in the hashmap, if the count is as expected, include them
+    for (chrom, count) in found_chroms.iter() {
+        if *count == bamfiles.len() {
+            validchroms.push(chrom.clone());
+        } else {
+            println!("Chromosome {} is missing in at least one bam file, and thus ignored!", chrom);
+        }
+    }
+    let bam = IndexedReader::from_path(bamfiles[0]).unwrap();
     let header = bam.header().clone();
     let mut chromsizes = HashMap::new();
     for tid in 0..header.target_count() {
@@ -380,7 +403,9 @@ pub fn chrombounds_from_bam(bamfiles: Vec<&str>) -> HashMap<String, u32> {
             .expect("Invalid UTF-8 in chromosome name");
         let chromlen = header.target_len(tid)
             .expect("Error retrieving length for chromosome");
-        chromsizes.insert(chromname.to_string(), chromlen as u32);
+        if validchroms.contains(&chromname) {
+            chromsizes.insert(chromname, chromlen as u32);
+        }
     }
     chromsizes
 }
diff --git a/src/multibamsummary.rs b/src/multibamsummary.rs
index 34834e2a11..aa109c54d1 100644
--- a/src/multibamsummary.rs
+++ b/src/multibamsummary.rs
@@ -98,7 +98,7 @@ pub fn r_mbams(
         txniddesignator: txniddesignator.to_string(),
     };
     // From the first bamfile, get the chromosome sizes. Only the chromosome names are needed, to make sure no invalid regions are included later on.
-    let chromsizes = chrombounds_from_bam(bamfiles.get(0).unwrap());
+    let chromsizes = chrombounds_from_bam(bamfiles.iter().map(|x| x.as_str()).collect());
 
     binsize = 1;
     distance_between_bins = 0;

From 3515c835724c9a12fac99b44843e46737c86089e Mon Sep 17 00:00:00 2001
From: WardDeb
Date: Mon, 27 Jan 2025 12:43:13 +0100
Subject: [PATCH 07/11] Simplify, and include a parallel block iterator in
 binsize>1 calcs
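
For binsize > 1, the per-read bin assignment now flattens rust_htslib's
aligned_blocks() into reference positions and deduplicates the touched
bins through a HashSet. A standalone sketch of that mapping (touched_bins
is an illustrative name; the blocks mimic the [start, end) pairs yielded
by aligned_blocks()):

    use std::collections::HashSet;

    // Map every reference position covered by a read block to its bin;
    // the HashSet guarantees the read is counted at most once per bin.
    fn touched_bins(blocks: &[[i64; 2]], region_end: u32, binsize: u32) -> HashSet<usize> {
        blocks.iter()
            .filter(|b| (b[1] as u32) < region_end)
            .flat_map(|b| b[0] as u32..b[1] as u32)
            .map(|pos| (pos / binsize) as usize)
            .collect()
    }

    // A read with blocks [100,150) and [180,220) at binsize 50 touches
    // bins {2, 3, 4}:
    // assert_eq!(touched_bins(&[[100, 150], [180, 220]], 1_000, 50).len(), 3);
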
---
 src/covcalc.rs | 72 +++++++-----------------------------------
 src/multibamsummary.rs | 53 ++++++++++++++++---------------
 2 files changed, 38 insertions(+), 87 deletions(-)

diff --git a/src/covcalc.rs b/src/covcalc.rs
index 4b2b06c5c4..0bbf510c88 100644
--- a/src/covcalc.rs
+++ b/src/covcalc.rs
@@ -1,10 +1,13 @@
 use rust_htslib::bam::{self, Read, IndexedReader, record::Cigar};
+use rust_htslib::bam::ext::BamRecordExtensions;
 use std::collections::HashMap;
 use tempfile::{Builder, TempPath};
 use std::io::{BufWriter, Write};
 use std::cmp::min;
 use std::fmt;
 use ndarray::Array1;
+use rayon::prelude::*;
+use std::collections::HashSet;
 
 pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: Vec<&str>) -> (Vec<Region>, HashMap<String, u32>) {
     // Takes a vector of regions, and a bam reference
@@ -172,41 +175,14 @@ pub fn bam_pileup<'a>(
                         readlens.push(record.seq_len() as u32);
                     }
                 }
-
-                let blocks = bam_blocks(record);
-                // keep a record of bin indices that have been updated already
-                let mut changedbins: Vec<u32> = Vec::new();
-                for block in blocks.into_iter() {
-                    // Don't count blocks that exceed the chromosome
-                    if block.0 >= region.2 {
-                        continue;
-                    }
-                    binix = block.0 / binsize;
-                    if !changedbins.contains(&binix) {
-                        counts[binix as usize] += 1.0;
-                        changedbins.push(binix);
-                    }
-                    // Don't count blocks that exceed the chromosome
-                    if block.1 >= region.2 {
-                        continue;
-                    }
-                    // Note that our blocks are strictly less then region ends, hence no problem with bin spewing at end:
-                    // binix = [a,b) where b == region.2 would divide into binix+1 (doesn't exist).
-                    binix = block.1 / binsize;
-                    if !changedbins.contains(&binix) {
-                        counts[binix as usize] += 1.0;
-                        changedbins.push(binix);
-                    }
-                }
-                // if changedbins contains non-continuous blocks (perhaps read length spans multiple bins), update the inbetweens
-                if let (Some(&min), Some(&max)) = (changedbins.iter().min(), changedbins.iter().max()) {
-                    for ix in min..=max {
-                        if !changedbins.contains(&ix) {
-                            counts[ix as usize] += 1.0;
-                            changedbins.push(ix);
-                        }
-                    }
-                }
+                let indices: HashSet<usize> = record.aligned_blocks()
+                    .par_bridge()
+                    .filter(|x| (x[1] as u32) < region.2)
+                    .flat_map(|x| x[0] as u32..x[1] as u32)
+                    .map(|x| (x / binsize) as usize)
+                    .collect();
+                indices.into_iter()
+                    .for_each(|ix| counts[ix] += 1.0);
             }
             let mut writer = BufWriter::new(&bg);
             // There are two scenarios:
@@ -372,32 +348,6 @@ pub fn bam_pileup<'a>(
     return (tmpvec, mapped_reads, unmapped_reads, readlens, fraglens);
 }
 
-/// Extract contigious blocks from a bam record
-/// Blocks are split per insertion, padding, deletion and ref skip
-fn bam_blocks(rec: bam::Record) -> Vec<(u32, u32)> {
-    let mut blocks: Vec<(u32, u32)> = Vec::new();
-    let mut pos = rec.pos();
-
-    for c in rec.cigar().iter() {
-        match c {
-            Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => {
-                let start = pos;
-                let end = pos + *len as i64 - 1;
-                blocks.push((start as u32, end as u32));
-                pos = end;
-            }
-            Cigar::Ins(len) | Cigar::Pad(len) => {
-                pos += *len as i64;
-            }
-            Cigar::Del(len) | Cigar::RefSkip(len) => {
-                pos += *len as i64;
-            }
-            _ => (),
-        }
-    }
-    return blocks;
-}
-
 pub struct Alignmentfilters {
     pub minmappingquality: u8,
     pub samflaginclude: u16,
diff --git a/src/multibamsummary.rs b/src/multibamsummary.rs
index aa109c54d1..7e96b43f04 100644
--- a/src/multibamsummary.rs
+++ b/src/multibamsummary.rs
@@ -4,7 +4,7 @@ use rayon::prelude::*;
 use rayon::ThreadPoolBuilder;
 use itertools::{multizip, multiunzip, izip};
 use std::io::prelude::*;
-use std::io::BufReader;
+use std::io::{BufReader, BufWriter};
 use std::fs::File;
 use ndarray::{Array1, Array2, stack, Axis};
 use std::io;
@@ -167,6 +167,18 @@ pub fn r_mbams(
         println!("Define output file");
     }
 
+    let mut cfile: Option<BufWriter<File>> = None;
+    if out_raw_counts != "None" {
+        cfile = Some(BufWriter::new(File::create(out_raw_counts).unwrap()));
+        // Write the header to the file.
+        let mut headstr = String::new();
+        headstr.push_str("#\'chr\'\t\'start\'\t\'end\'");
+        for label in bamlabels.iter() {
+            headstr.push_str(&format!("\t\'{}\'", label));
+        }
+        writeln!(cfile.as_mut().unwrap(), "{}", headstr).unwrap();
+    }
+
     // Collate the coverage files into a matrix.
     let its: Vec<_> = covcalcs.iter().map(|x| x.into_iter()).collect();
     let zips = TempZip { iterators: its };
     if verbose {
         println!("Start iterating through temp coverage files and create output npy.");
     }
-
     let zips_vec: Vec<_> = zips.collect();
     let matvec: Array2<f64> = pool.install(|| {
         let matvecs: Vec<Array1<f64>> = zips_vec
-            .par_iter()
+            .iter()
             .flat_map(|c| {
-                let readers: Vec<_> = c.iter().map(|x| BufReader::new(File::open(x).unwrap()).lines()).collect();
+                let readers: Vec<_> = c.par_iter().map(|x| BufReader::new(File::open(x).unwrap()).lines()).collect();
                 let mut _matvec: Vec<Array1<f64>> = Vec::new();
+                let mut _regions: Vec<(String, u32, u32)> = Vec::new();
                 for mut _l in (TempZip { iterators: readers }) {
                     // unwrap all lines in _l
                     let lines: Vec<_> = _l
                         .iter_mut()
                         .map(|x| x.as_mut().unwrap())
                         .map(|x| x.split('\t').collect())
                         .map(|x: Vec<&str>| ( x[0].to_string(), x[1].parse::<u32>().unwrap(), x[2].parse::<u32>().unwrap(), x[3].parse::<f64>().unwrap() ) )
                         .collect();
                     let arrline = Array1::from(lines.iter().map(|x| x.3).collect::<Vec<f64>>());
+                    // If cfile is defined, write the raw counts to file.
+                    if let Some(ref mut file) = cfile {
+                        let mut regstr = String::new();
+                        regstr.push_str(&format!("{}\t{}\t{}", lines[0].0, lines[0].1, lines[0].2));
+                        // push values from array
+                        for val in arrline.iter() {
+                            regstr.push_str(&format!("\t{}", val));
+                        }
+                        writeln!(file, "{}", regstr).unwrap();
+                    }
                     _matvec.push(arrline);
                 }
                 _matvec
             })
             .collect();
         stack(Axis(0), &matvecs.iter().map(|x| x.view()).collect::<Vec<_>>()).unwrap()
     });
@@ -207,7 +230,7 @@ pub fn r_mbams(
             writeln!(sf_file, "{}\t{}", label, sf).unwrap();
         }
     }
-    let mut npz = NpzWriter::new(File::create(ofile).unwrap());
+    let mut npz = NpzWriter::new_compressed(File::create(ofile).unwrap());
     npz.add_array("matrix", &matvec).unwrap();
     npz.add_array("labels", &bamlabels_arr).unwrap();
     npz.finish().unwrap();
@@ -215,28 +238,6 @@ pub fn r_mbams(
         println!("Matrix written.");
     }
 
-    // If raw counts are required, write them to file.
-    if out_raw_counts != "None" {
-        let mut cfile = File::create(out_raw_counts).unwrap();
-        // Create the header
-        let mut headstr = String::new();
-        headstr.push_str("#\'chr\'\t\'start\'\t\'end\'");
-        for label in bamlabels.iter() {
-            headstr.push_str(&format!("\t\'{}\'", label));
-        }
-        writeln!(cfile, "{}", headstr).unwrap();
-        for (i, region) in regions.iter().enumerate() {
-            let mut line = String::new();
-            line.push_str(&format!("{}\t{}\t{}", region.chrom, region.get_startu(), region.get_endu()));
-            for j in 0..bamlabels.len() {
-                line.push_str(&format!("\t{}", matvec[[i, j]]));
-            }
-            writeln!(cfile, "{}", line).unwrap();
-        }
-    }
-    if verbose {
-        println!("Counts written.");
-    }
     Ok(())
 }

From 3c59ac4f55fd2091f0cfbf3de392deba61cb3f69 Mon Sep 17 00:00:00 2001
From: WardDeb
Date: Mon, 27 Jan 2025 17:23:03 +0100
Subject: [PATCH 08/11] Drop the par_bridge
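
par_bridge() was feeding each read's handful of aligned blocks into
rayon, where the bridging and synchronization overhead likely costs more
than it saves on such tiny iterators; the plain sequential chain is the
better fit here. A rough harness for sanity-checking a micro-decision
like this, as an illustrative sketch only (time_it is not part of the
patch):

    use std::time::Instant;

    // Run a closure `n` times and report the wall-clock total, so the
    // sequential and par_bridge() variants can be compared on real data.
    fn time_it<F: FnMut()>(label: &str, n: u32, mut f: F) {
        let start = Instant::now();
        for _ in 0..n {
            f();
        }
        println!("{}: {:?} for {} runs", label, start.elapsed(), n);
    }
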
---
 src/covcalc.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/covcalc.rs b/src/covcalc.rs
index 0bbf510c88..4b1d472803 100644
--- a/src/covcalc.rs
+++ b/src/covcalc.rs
@@ -175,8 +175,8 @@ pub fn bam_pileup<'a>(
                         readlens.push(record.seq_len() as u32);
                     }
                 }
-                let indices: HashSet<usize> = record.aligned_blocks()
-                    .par_bridge()
+                let indices: HashSet<usize> = record
+                    .aligned_blocks()
                     .filter(|x| (x[1] as u32) < region.2)
                     .flat_map(|x| x[0] as u32..x[1] as u32)
                     .map(|x| (x / binsize) as usize)

From fbb44359dd77ca9ef8cb1b045bf4e69c0446d4a3 Mon Sep 17 00:00:00 2001
From: WardDeb
Date: Tue, 28 Jan 2025 10:45:43 +0100
Subject: [PATCH 09/11] Drop v3 print statements
---
 pydeeptools/deeptools/writeBedGraph.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pydeeptools/deeptools/writeBedGraph.py b/pydeeptools/deeptools/writeBedGraph.py
index 4888aca945..44b6a2d202 100644
--- a/pydeeptools/deeptools/writeBedGraph.py
+++ b/pydeeptools/deeptools/writeBedGraph.py
@@ -150,8 +150,6 @@ def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, fo
             region=self.region,
             blackListFileName=blackListFileName,
             numberOfProcessors=self.numberOfProcessors)
-        print("Result after mapReduce")
-        print(res)
         # Determine the sorted order of the temp files
         chrom_order = dict()
         for i, _ in enumerate(chrom_names_and_size):

From 4c83c9c0dcd9117eedfbb456229600e7e259c5aa Mon Sep 17 00:00:00 2001
From: WardDeb
Date: Tue, 28 Jan 2025 10:46:09 +0100
Subject: [PATCH 10/11] Drop validchrom printing; sort regions to produce
 properly ordered bw files.
---
 src/covcalc.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/covcalc.rs b/src/covcalc.rs
index 4b1d472803..275ec115a9 100644
--- a/src/covcalc.rs
+++ b/src/covcalc.rs
@@ -39,7 +39,6 @@ pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: Vec<&str>) -
     }
     // Crash if validchroms is empty.
     assert!(!validchroms.is_empty(), "No chromosomes found that are present in all bam files. Did you mix references?");
-    println!("Valid chromosomes are: {:?}", validchroms);
     // Read header from first bam file
     let bam = IndexedReader::from_path(bam_ifile[0]).unwrap();
     let header = bam.header().clone();
@@ -101,7 +100,7 @@ pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: Vec<&str>) -
     }
     // Sort regions to make our lives easier down the line
     // Sort Vec of Regions per chromosome, and then by start.
-    // chromregions.sort_by(|a, b| a.chrom.cmp(&b.chrom).then(a.start.cmp(&b.start)));
+    chromregions.sort_by(|a, b| a.chrom.cmp(&b.chrom).then(a.get_startu().cmp(&b.get_startu())));
     return (chromregions, chromsizes);
 }

From d415afe03c3349efe341b902c79be0aa858b5509 Mon Sep 17 00:00:00 2001
From: WardDeb
Date: Wed, 29 Jan 2025 09:02:39 +0100
Subject: [PATCH 11/11] Properly calculate multiBamSummary counts in BED mode
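
The npz written by multiBamSummary stores labels as a fixed-width u8
matrix, NUL-padded on the right; correlation.py below decodes them back
to strings. The Rust-side equivalent of that decoding, as an illustrative
sketch (decode_label is not an identifier from this patch):

    // One row of the `labels` array is a label's utf-8 bytes followed
    // by zero padding; cut at the first NUL and decode.
    fn decode_label(row: &[u8]) -> String {
        let end = row.iter().position(|&b| b == 0).unwrap_or(row.len());
        String::from_utf8_lossy(&row[..end]).into_owned()
    }
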
---
 pydeeptools/deeptools/correlation.py | 7 +-
 src/bamcompare.rs | 2 +-
 src/bamcoverage.rs | 2 +-
 src/covcalc.rs | 93 +++++++++++++++++-------
 src/multibamsummary.rs | 102 +++++++++++++++++----------
 5 files changed, 139 insertions(+), 67 deletions(-)

diff --git a/pydeeptools/deeptools/correlation.py b/pydeeptools/deeptools/correlation.py
index 56b8d91d2f..b722d3a00f 100755
--- a/pydeeptools/deeptools/correlation.py
+++ b/pydeeptools/deeptools/correlation.py
@@ -92,8 +92,11 @@ def load_matrix(self, matrix_file):
                              "and plotting\n".format(num_nam))
             self.matrix = np.ma.compress_rows(np.ma.masked_invalid(self.matrix))
-
-        self.labels = list(map(toString, _ma['labels']))
+        # Since deeptools 4.0 the labels are encoded as uint8 and need to be decoded.
+        if _ma['labels'].dtype == np.uint8:
+            self.labels = [bytes(x).decode('utf-8').rstrip('\x00') for x in _ma['labels']]
+        else:
+            self.labels = list(map(toString, _ma['labels']))
 
         assert len(self.labels) == self.matrix.shape[1], "ERROR, length of labels is not equal " \
                                                          "to length of matrix samples"
diff --git a/src/bamcompare.rs b/src/bamcompare.rs
index 375b7b3432..1acb38f877 100644
--- a/src/bamcompare.rs
+++ b/src/bamcompare.rs
@@ -72,7 +72,7 @@ pub fn r_bamcompare(
         bamfiles.par_iter()
             .map(|(bamfile, ispe)| {
                 let (bg, mapped, unmapped, readlen, fraglen) = regionblocks.par_iter()
-                    .map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false))
+                    .map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false, false))
                     .reduce(
                         || (vec![], 0, 0, vec![], vec![]),
                         |(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
diff --git a/src/bamcoverage.rs b/src/bamcoverage.rs
index 1731e3d59b..8d74a6ff13 100644
--- a/src/bamcoverage.rs
+++ b/src/bamcoverage.rs
@@ -75,7 +75,7 @@ pub fn r_bamcoverage(
     let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();
     let (bg, mapped, _unmapped, readlen, fraglen) = pool.install(|| {
         regionblocks.par_iter()
-            .map(|i| bam_pileup(bamifile, &i, &binsize, &ispe, &ignorechr, &filters, true))
+            .map(|i| bam_pileup(bamifile, &i, &binsize, &ispe, &ignorechr, &filters, true, false))
             .reduce(
                 || (vec![], 0, 0, vec![], vec![]),
                 |(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
diff --git a/src/covcalc.rs b/src/covcalc.rs
index 275ec115a9..32770cad29 100644
--- a/src/covcalc.rs
+++ b/src/covcalc.rs
@@ -110,11 +110,12 @@ pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: Vec<&str>) -
 pub fn bam_pileup<'a>(
     bam_ifile: &str,
     regionvec: &'a Vec<Region>,
-    binsize: &u32,
+    provbs: &u32,
     ispe: &bool,
     ignorechr: &Vec<String>,
     _filters: &Alignmentfilters,
     collapse: bool,
+    gene_mode: bool
 ) -> (
     Vec<TempPath>, // temp bedgraph file.
     u32, // mapped reads
@@ -122,7 +123,8 @@ pub fn bam_pileup<'a>(
     Vec<u32>, // read lengths
     Vec<u32>, // fragment lengths
 ) {
-
+    let mut bs = *provbs;
+    let mut binsize = &bs;
     // constant to check if read is first in pair (relevant later)
     const FREAD: u16 = 0x40;
@@ -149,39 +151,74 @@ pub fn bam_pileup<'a>(
     // Convert region to tuple to ease computations.
     for regstruct in regionvec.iter() {
-        let region = (regstruct.chrom.clone(), regstruct.get_startu(), regstruct.get_endu());
+        // There are two options here:
+        // either we are supposed to calculate coverage over regions (variable binsize required), gene_mode = true
+        // or we have a regular bin setting, gene_mode = false
+        let mut region: (String, u32, u32);
+        if gene_mode {
+            region = (regstruct.chrom.clone(), regstruct.get_startu(), regstruct.get_endu());
+            bs = region.2 - region.1;
+            binsize = &bs;
+        } else {
+            region = (regstruct.chrom.clone(), regstruct.get_startu(), regstruct.get_endu());
+        }
+        //let region = (regstruct.chrom.clone(), regstruct.get_startu(), regstruct.get_endu());
         bam.fetch((region.0.as_str(), region.1, region.2))
             .expect(&format!("Error fetching region: {:?}", region));
 
         if binsize > &1 {
-            // populate the bg vector with 0 counts over all bins
-            let mut counts: Vec<f32> = vec![0.0; (region.2 - region.1).div_ceil(*binsize) as usize];
-            // let mut binstart = region.1;
-            let mut binix: u32 = 0;
-
-            for record in bam.records() {
-                let record = record.expect("Error parsing record.");
-                if !ignorechr.contains(&region.0) {
-                    if record.is_unmapped() {
-                        unmapped_reads += 1;
-                    } else {
-                        mapped_reads += 1;
-                        if *ispe {
-                            if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
-                                fraglens.push(record.insert_size().abs() as u32);
-                            }
-                        }
-                        readlens.push(record.seq_len() as u32);
-                    }
-                }
-                let indices: HashSet<usize> = record
-                    .aligned_blocks()
-                    .filter(|x| (x[1] as u32) < region.2)
-                    .flat_map(|x| x[0] as u32..x[1] as u32)
-                    .map(|x| (x / binsize) as usize)
-                    .collect();
-                indices.into_iter()
-                    .for_each(|ix| counts[ix] += 1.0);
-            }
+            let mut counts: Vec<f32>;
+
+            if gene_mode {
+                counts = vec![0.0; 1];
+                for record in bam.records() {
+                    let record = record.expect("Error parsing record.");
+                    if !ignorechr.contains(&region.0) {
+                        if record.is_unmapped() {
+                            unmapped_reads += 1;
+                        } else {
+                            mapped_reads += 1;
+                            if *ispe {
+                                if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
+                                    fraglens.push(record.insert_size().abs() as u32);
+                                }
+                            }
+                            readlens.push(record.seq_len() as u32);
+                        }
+                    }
+                    counts[0] += 1.0;
+                }
+            } else {
+                // populate the bg vector with 0 counts over all bins
+                counts = vec![0.0; (region.2 - region.1).div_ceil(*binsize) as usize];
+                // let mut binstart = region.1;
+                let mut binix: u32 = 0;
+
+                for record in bam.records() {
+                    let record = record.expect("Error parsing record.");
+                    if !ignorechr.contains(&region.0) {
+                        if record.is_unmapped() {
+                            unmapped_reads += 1;
+                        } else {
+                            mapped_reads += 1;
+                            if *ispe {
+                                if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
+                                    fraglens.push(record.insert_size().abs() as u32);
+                                }
+                            }
+                            readlens.push(record.seq_len() as u32);
+                        }
+                    }
+
+                    let indices: HashSet<usize> = record
+                        .aligned_blocks()
+                        .filter(|x| (x[1] as u32) < region.2)
+                        .flat_map(|x| x[0] as u32..x[1] as u32)
+                        .map(|x| (x / binsize) as usize)
+                        .collect();
+                    indices.into_iter()
+                        .for_each(|ix| counts[ix] += 1.0);
+                }
+            }
             let mut writer = BufWriter::new(&bg);
diff --git a/src/multibamsummary.rs b/src/multibamsummary.rs
index 7e96b43f04..8aada7711b 100644
--- a/src/multibamsummary.rs
+++ b/src/multibamsummary.rs
@@ -86,7 +86,7 @@ pub fn r_mbams(
     };
 
     let mut regions: Vec<Region> = Vec::new();
-
+    let mut gene_mode = false;
     if mode == "BED-file" {
         if verbose {
             println!("BED file mode. with files: {:?}", bedfiles);
@@ -120,6 +120,7 @@ pub fn r_mbams(
             regions.extend(reg);
             regionsizes.insert(regsize.0, regsize.1);
         });
+        gene_mode = true;
     } else {
         if verbose {
             println!("BINS mode, with binsize: {}, distance between bins: {}", binsize, distance_between_bins);
@@ -146,7 +147,7 @@ pub fn r_mbams(
         bampfiles.par_iter()
             .map(|(bamfile, ispe)| {
                 let (bg, _mapped, _unmapped, _readlen, _fraglen) = regionblocks.par_iter()
-                    .map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false))
+                    .map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false, gene_mode))
                     .reduce(
                         || (vec![], 0, 0, vec![], vec![]),
                         |(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
@@ -167,87 +168,112 @@ pub fn r_mbams(
         println!("Define output file");
     }
 
-    let mut cfile: Option<BufWriter<File>> = None;
-    if out_raw_counts != "None" {
-        cfile = Some(BufWriter::new(File::create(out_raw_counts).unwrap()));
-        // Write the header to the file.
-        let mut headstr = String::new();
-        headstr.push_str("#\'chr\'\t\'start\'\t\'end\'");
-        for label in bamlabels.iter() {
-            headstr.push_str(&format!("\t\'{}\'", label));
-        }
-        writeln!(cfile.as_mut().unwrap(), "{}", headstr).unwrap();
-    }
-
     // Collate the coverage files into a matrix.
     let its: Vec<_> = covcalcs.iter().map(|x| x.into_iter()).collect();
     let zips = TempZip { iterators: its };
     if verbose {
         println!("Start iterating through temp coverage files and create output npy.");
     }
     let zips_vec: Vec<_> = zips.collect();
-    let matvec: Array2<f64> = pool.install(|| {
-        let matvecs: Vec<Array1<f64>> = zips_vec
-            .iter()
+
+    let matvec: Vec<_> = pool.install(|| {
+        let _m: Vec<_> = zips_vec
+            .par_iter()
             .flat_map(|c| {
                 let readers: Vec<_> = c.par_iter().map(|x| BufReader::new(File::open(x).unwrap()).lines()).collect();
-                let mut _matvec: Vec<Array1<f64>> = Vec::new();
+                let mut _matvec: Vec<Vec<f64>> = Vec::new();
                 let mut _regions: Vec<(String, u32, u32)> = Vec::new();
                 for mut _l in (TempZip { iterators: readers }) {
                     // unwrap all lines in _l
                     let lines: Vec<_> = _l
-                        .iter_mut()
+                        .par_iter_mut()
                         .map(|x| x.as_mut().unwrap())
                         .map(|x| x.split('\t').collect())
                         .map(|x: Vec<&str>| ( x[0].to_string(), x[1].parse::<u32>().unwrap(), x[2].parse::<u32>().unwrap(), x[3].parse::<f64>().unwrap() ) )
                         .collect();
-                    let arrline = Array1::from(lines.iter().map(|x| x.3).collect::<Vec<f64>>());
-                    // If cfile is defined, write the raw counts to file.
-                    if let Some(ref mut file) = cfile {
-                        let mut regstr = String::new();
-                        regstr.push_str(&format!("{}\t{}\t{}", lines[0].0, lines[0].1, lines[0].2));
-                        // push values from array
-                        for val in arrline.iter() {
-                            regstr.push_str(&format!("\t{}", val));
-                        }
-                        writeln!(file, "{}", regstr).unwrap();
-                    }
-                    _matvec.push(arrline);
+                    let counts = lines.par_iter().map(|x| x.3).collect::<Vec<f64>>();
+                    let regions: (String, u32, u32) = (lines[0].0.clone(), lines[0].1, lines[0].2);
+                    _matvec.push(counts);
+                    _regions.push(regions);
                 }
-                _matvec
+                // zip counts and coordinates so they stay paired per region
+                _matvec.into_iter().zip(_regions).collect::<Vec<_>>()
             })
             .collect();
-        stack(Axis(0), &matvecs.iter().map(|x| x.view()).collect::<Vec<_>>()).unwrap()
+        _m
     });
+    // Separate the matrices and regions
+    let (matvec, regions): (Vec<_>, Vec<_>) = multiunzip(matvec);
+
+    // Write out the count files, if appropriate
+    if out_raw_counts != "None" {
+        if verbose {
+            println!("Writing raw counts to disk.");
+        }
+        let mut cfile = BufWriter::new(File::create(out_raw_counts).unwrap());
+        // Write the header to the file.
+        let mut headstr = String::new();
+        headstr.push_str("#\'chr\'\t\'start\'\t\'end\'");
+        for label in bamlabels.iter() {
+            headstr.push_str(&format!("\t\'{}\'", label));
+        }
+        writeln!(cfile, "{}", headstr).unwrap();
+        let outlines: Vec<String> = pool.install(|| {
+            regions.par_iter().zip(matvec.par_iter())
+                .map(|(region, counts)| {
+                    let mut outstr = String::new();
+                    outstr.push_str(&format!("{}\t{}\t{}", region.0, region.1, region.2));
+                    for count in counts.iter() {
+                        outstr.push_str(&format!("\t{}", count));
+                    }
+                    outstr
+                })
+                .collect()
+        });
+        for line in outlines {
+            writeln!(cfile, "{}", line).unwrap();
+        }
+    }
+
+    // Create 2darray from matvec
+    let matarr: Array2<f64> = Array2::from_shape_vec(
+        (matvec.len(), matvec[0].len()), matvec.into_iter().flatten().collect()
+    ).unwrap();
+
     // If scalefactors are required, calc and save them now.
     if scaling_factors != "None" {
-        let sf = deseq_scalefactors(&matvec);
+        if verbose {
+            println!("Calculating scale factors.");
+        }
+        let sf = deseq_scalefactors(&matarr);
         // save scalefactors to file
         let mut sf_file = File::create(scaling_factors).unwrap();
         writeln!(sf_file, "Sample\tscalingFactor").unwrap();
         for (label, sf) in bamlabels.iter().zip(sf.iter()) {
             writeln!(sf_file, "{}\t{}", label, sf).unwrap();
         }
     }
+
+    if verbose {
+        println!("Writing matrix to disk.");
+    }
     let mut npz = NpzWriter::new_compressed(File::create(ofile).unwrap());
-    npz.add_array("matrix", &matvec).unwrap();
+    npz.add_array("matrix", &matarr).unwrap();
     npz.add_array("labels", &bamlabels_arr).unwrap();
     npz.finish().unwrap();
     if verbose {