Skip to content

Commit

Permalink
properly calculate mbs - BED mode
Browse files Browse the repository at this point in the history
  • Loading branch information
WardDeb committed Jan 29, 2025
1 parent 4c83c9c commit d415afe
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 67 deletions.
7 changes: 5 additions & 2 deletions pydeeptools/deeptools/correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,11 @@ def load_matrix(self, matrix_file):
"and plotting\n".format(num_nam))

self.matrix = np.ma.compress_rows(np.ma.masked_invalid(self.matrix))

self.labels = list(map(toString, _ma['labels']))
# Since deeptools 4.0 the labels are encoded and would need to be decoded.
if _ma['labels'].dtype == np.uint8:
self.labels = [bytes(x).decode('utf-8').rstrip('\x00') for x in _ma['labels']]
else:
self.labels = list(map(toString, _ma['labels']))

assert len(self.labels) == self.matrix.shape[1], "ERROR, length of labels is not equal " \
"to length of matrix samples"
Expand Down
2 changes: 1 addition & 1 deletion src/bamcompare.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ pub fn r_bamcompare(
bamfiles.par_iter()
.map(|(bamfile, ispe)| {
let (bg, mapped, unmapped, readlen, fraglen) = regionblocks.par_iter()
.map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false))
.map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false, false))
.reduce(
|| (vec![], 0, 0, vec![], vec![]),
|(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
Expand Down
2 changes: 1 addition & 1 deletion src/bamcoverage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ pub fn r_bamcoverage(
let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();
let (bg, mapped, _unmapped, readlen, fraglen) = pool.install(|| {
regionblocks.par_iter()
.map(|i| bam_pileup(bamifile, &i, &binsize, &ispe, &ignorechr, &filters, true))
.map(|i| bam_pileup(bamifile, &i, &binsize, &ispe, &ignorechr, &filters, true, false))
.reduce(
|| (vec![], 0, 0, vec![], vec![]),
|(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
Expand Down
93 changes: 66 additions & 27 deletions src/covcalc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,19 +110,21 @@ pub fn parse_regions(regions: &Vec<(String, u32, u32)>, bam_ifile: Vec<&str>) ->
pub fn bam_pileup<'a>(
bam_ifile: &str,
regionvec: &'a Vec<Region>,
binsize: &u32,
provbs: &u32,
ispe: &bool,
ignorechr: &Vec<String>,
_filters: &Alignmentfilters,
collapse: bool,
gene_mode: bool
) -> (
Vec<TempPath>, // temp bedgraph file.
u32, // mapped reads
u32, // unmapped reads
Vec<u32>, // read lengths
Vec<u32>, // fragment lengths
) {

let mut bs = *provbs;
let mut binsize = &bs;
// constant to check if read is first in pair (relevant later)
const FREAD: u16 = 0x40;

Expand All @@ -149,39 +151,76 @@ pub fn bam_pileup<'a>(

// Convert region to tuple to ease computations.
for regstruct in regionvec.iter() {
let region = (regstruct.chrom.clone(), regstruct.get_startu(), regstruct.get_endu());
// There are two options here:
// either we are supposed to calculate coverage over regions (variable binsize required) gene_mode = true
// or we have a regular bin setting, gene_mode = false
let mut region: (String, u32, u32);
if gene_mode {
region = (regstruct.chrom.clone(), regstruct.get_startu(), regstruct.get_endu());
bs = region.2 - region.1;
binsize = &bs;
} else {
region = (regstruct.chrom.clone(), regstruct.get_startu(), regstruct.get_endu());
}
//let region = (regstruct.chrom.clone(), regstruct.get_startu(), regstruct.get_endu());
bam.fetch((region.0.as_str(), region.1, region.2))
.expect(&format!("Error fetching region: {:?}", region));

if binsize > &1 {
// populate the bg vector with 0 counts over all bins
let mut counts: Vec<f32> = vec![0.0; (region.2 - region.1).div_ceil(*binsize) as usize];
// let mut binstart = region.1;
let mut binix: u32 = 0;

for record in bam.records() {
let record = record.expect("Error parsing record.");
if !ignorechr.contains(&region.0) {
if record.is_unmapped() {
unmapped_reads += 1;
} else {
mapped_reads += 1;
if *ispe {
if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
fraglens.push(record.insert_size().abs() as u32);
let mut counts: Vec<f32>;

if gene_mode {
counts = vec![0.0; 1];
for record in bam.records() {
let record = record.expect("Error parsing record.");
if !ignorechr.contains(&region.0) {
if record.is_unmapped() {
unmapped_reads += 1;
} else {
mapped_reads += 1;
if *ispe {
if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
fraglens.push(record.insert_size().abs() as u32);
}
}
readlens.push(record.seq_len() as u32);
}
}
counts[0] += 1.0;
}
} else {
// populate the bg vector with 0 counts over all bins
counts = vec![0.0; (region.2 - region.1).div_ceil(*binsize) as usize];
println!("LENGTH OF THE VECO BRO {:?}", counts.len());
// let mut binstart = region.1;
let mut binix: u32 = 0;

for record in bam.records() {
let record = record.expect("Error parsing record.");
if !ignorechr.contains(&region.0) {
if record.is_unmapped() {
unmapped_reads += 1;
} else {
mapped_reads += 1;
if *ispe {
if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
fraglens.push(record.insert_size().abs() as u32);
}
}
readlens.push(record.seq_len() as u32);
}
readlens.push(record.seq_len() as u32);
}

let indices: HashSet<usize> = record
.aligned_blocks()
.filter(|x| (x[1] as u32) < region.2)
.flat_map(|x| x[0] as u32..x[1] as u32)
.map(|x| (x / binsize) as usize)
.collect();
println!("INDICES = {:?}", indices);
indices.into_iter()
.for_each(|ix| counts[ix] += 1.0);
}
let indices: HashSet<usize> = record
.aligned_blocks()
.filter(|x| (x[1] as u32) < region.2)
.flat_map(|x| x[0] as u32..x[1] as u32)
.map(|x| (x / binsize) as usize)
.collect();
indices.into_iter()
.for_each(|ix| counts[ix] += 1.0);
}
let mut writer = BufWriter::new(&bg);
// There are two scenarios:
Expand Down
102 changes: 66 additions & 36 deletions src/multibamsummary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ pub fn r_mbams(
};

let mut regions: Vec<Region> = Vec::new();

let mut gene_mode = false;
if mode == "BED-file" {
if verbose {
println!("BED file mode. with files: {:?}", bedfiles);
Expand Down Expand Up @@ -120,6 +120,7 @@ pub fn r_mbams(
regions.extend(reg);
regionsizes.insert(regsize.0, regsize.1);
});
gene_mode = true;
} else {
if verbose {
println!("BINS mode. with binsize: {}, distance between bins: {}", binsize, distance_between_bins);
Expand All @@ -146,7 +147,7 @@ pub fn r_mbams(
bampfiles.par_iter()
.map(|(bamfile, ispe)| {
let (bg, _mapped, _unmapped, _readlen, _fraglen) = regionblocks.par_iter()
.map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false))
.map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false, gene_mode))
.reduce(
|| (vec![], 0, 0, vec![], vec![]),
|(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
Expand All @@ -167,71 +168,100 @@ pub fn r_mbams(
println!("Define output file");
}

let mut cfile: Option<BufWriter<File>> = None;
if out_raw_counts != "None" {
cfile = Some(BufWriter::new(File::create(out_raw_counts).unwrap()));
// Write the header to the file.
let mut headstr = String::new();
headstr.push_str("#\'chr\'\t\'start\'\t\'end\'");
for label in bamlabels.iter() {
headstr.push_str(&format!("\t\'{}\'", label));
}
writeln!(cfile.as_mut().unwrap(), "{}", headstr).unwrap();
}

// Collate the coverage files into a matrix.
let its: Vec<_> = covcalcs.iter().map(|x| x.into_iter()).collect();
let zips = TempZip { iterators: its };
if verbose {
println!("Start iterating through temp coverage files and create output npy.");
}
let zips_vec: Vec<_> = zips.collect();
let matvec: Array2<f32> = pool.install(|| {
let matvecs: Vec<Array1<f32>> = zips_vec
.iter()
let zips_vec: Vec<_> = zips.collect();
println!(" Length of ziperators = {}", zips_vec.len());

let mut matvec: Vec<Vec<f32>> = Vec::new();
let matvec: Vec<_> = pool.install(|| {
let _m: Vec<_> = zips_vec
.par_iter()
.flat_map(|c| {
let readers: Vec<_> = c.par_iter().map(|x| BufReader::new(File::open(x).unwrap()).lines()).collect();
let mut _matvec: Vec<Array1<f32>> = Vec::new();
let mut _matvec: Vec<Vec<f32>> = Vec::new();
let mut _regions: Vec<(String, u32, u32)> = Vec::new();
for mut _l in (TempZip { iterators: readers }) {
// unwrap all lines in _l
let lines: Vec<_> = _l
.iter_mut()
.par_iter_mut()
.map(|x| x.as_mut().unwrap())
.map(|x| x.split('\t').collect())
.map(|x: Vec<&str> | ( x[0].to_string(), x[1].parse::<u32>().unwrap(), x[2].parse::<u32>().unwrap(), x[3].parse::<f32>().unwrap() ) )
.collect();
let arrline = Array1::from(lines.iter().map(|x| x.3).collect::<Vec<_>>());
// If cfile is define, write the raw counts to file.
if let Some(ref mut file) = cfile {
let mut regstr = String::new();
regstr.push_str(&format!("{}\t{}\t{}",lines[0].0, lines[0].1, lines[0].2));
// push values from array
for val in arrline.iter() {
regstr.push_str(&format!("\t{}", val));
}
writeln!(file, "{}", regstr).unwrap();
}
_matvec.push(arrline);
let counts = lines.par_iter().map(|x| x.3).collect::<Vec<_>>();
let regions: (String, u32, u32) = (lines[0].0.clone(), lines[0].1, lines[0].2);
_matvec.push(counts);
_regions.push(regions);
}
_matvec
(_matvec, _regions)
})
.collect();
stack(Axis(0), &matvecs.iter().map(|x| x.view()).collect::<Vec<_>>()).unwrap()
_m
});

// Seperate the matrices and regions
let (matvec, regions): (Vec<_>, Vec<_>) = multiunzip(matvec);

// Write out the count files, if appropriate

if out_raw_counts != "None" {
if verbose {
println!("Writing raw counts to disk.");
}
let mut cfile = BufWriter::new(File::create(out_raw_counts).unwrap());
// Write the header to the file.
let mut headstr = String::new();
headstr.push_str("#\'chr\'\t\'start\'\t\'end\'");
for label in bamlabels.iter() {
headstr.push_str(&format!("\t\'{}\'", label));
}
writeln!(cfile, "{}", headstr).unwrap();
let outlines: Vec<String> = pool.install(|| {
regions.par_iter().zip(matvec.par_iter())
.map(|(region, counts)| {
let mut outstr = String::new();
outstr.push_str(&format!("{}\t{}\t{}", region.0, region.1, region.2));
for count in counts.iter() {
outstr.push_str(&format!("\t{}", count));
}
outstr
})
.collect()
});
for line in outlines {
writeln!(cfile, "{}", line).unwrap();
}
}

// Create 2darray from matvec
let matarr: Array2<f32> = Array2::from_shape_vec(
(matvec.len(), matvec[0].len()), matvec.into_iter().flatten().collect()
).unwrap();

// If scalefactors are required, calc and save them now.
if scaling_factors != "None" {
let sf = deseq_scalefactors(&matvec);
if verbose {
println!("Calculating scale factors.");
}
let sf = deseq_scalefactors(&matarr);
// save scalefactors to file
let mut sf_file = File::create(scaling_factors).unwrap();
writeln!(sf_file, "Sample\tscalingFactor").unwrap();
for (sf, label) in sf.iter().zip(bamlabels.iter()) {
writeln!(sf_file, "{}\t{}", label, sf).unwrap();
}
}

if verbose {
println!("Writing matrix to disk.");
}
let mut npz = NpzWriter::new_compressed(File::create(ofile).unwrap());
npz.add_array("matrix", &matvec).unwrap();
npz.add_array("matrix", &matarr).unwrap();
npz.add_array("labels", &bamlabels_arr).unwrap();
npz.finish().unwrap();
if verbose {
Expand Down

0 comments on commit d415afe

Please sign in to comment.