diff --git a/Cargo.toml b/Cargo.toml index 97f67f1..b9b79f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "STRdust" -version = "0.10.0" +version = "0.11.0" edition = "2021" diff --git a/src/genotype.rs b/src/genotype.rs index 44e8942..35f04ae 100644 --- a/src/genotype.rs +++ b/src/genotype.rs @@ -68,7 +68,7 @@ fn genotype_repeat( && args.haploid.as_ref().unwrap().contains(&repeat.chrom)) || args.unphased; - let reads = match crate::parse_bam::get_overlapping_reads(bam, repeat, unphased) { + let reads = match crate::parse_bam::get_overlapping_reads(bam, repeat, unphased, args.max_number_reads) { Some(seqs) => seqs, None => { // Return a missing genotype if no (phased) reads overlap the repeat @@ -369,7 +369,7 @@ mod tests { let unphased = false; let repeat_compressed_reference = repeat.make_repeat_compressed_sequence(&fasta, flanking); let mut bam = parse_bam::create_bam_reader(&bam, &fasta); - let binding = crate::parse_bam::get_overlapping_reads(&mut bam, &repeat, unphased).unwrap(); + let binding = crate::parse_bam::get_overlapping_reads(&mut bam, &repeat, unphased, 60).unwrap(); let read = binding .seqs .get(&1) @@ -420,6 +420,7 @@ mod tests { debug: false, sorted: false, consensus_reads: 20, + max_number_reads: 60 }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); @@ -451,6 +452,7 @@ mod tests { debug: false, sorted: false, consensus_reads: 20, + max_number_reads: 60 }; let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta); let genotype = genotype_repeat(&repeat, &args, &mut bam); @@ -476,6 +478,7 @@ mod tests { debug: false, sorted: false, consensus_reads: 20, + max_number_reads: 60 }; let repeat = crate::repeats::RepeatInterval { chrom: String::from("chr7"), @@ -507,6 +510,7 @@ mod tests { debug: false, sorted: false, consensus_reads: 20, + max_number_reads: 60 }; let repeat = crate::repeats::RepeatInterval { chrom: String::from("chr7"), @@ -537,7 +541,8 @@ mod tests { haploid: None, debug: false, sorted: false, - consensus_reads: 20 + consensus_reads: 20, + max_number_reads: 60, }; let repeat = crate::repeats::RepeatInterval { diff --git a/src/main.rs b/src/main.rs index 8a9c819..48f9d0d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -82,6 +82,10 @@ pub struct Cli { /// Max number of reads to use to generate consensus alt sequence #[clap(long, value_parser, default_value_t = 20)] consensus_reads: usize, + + // Max number of reads to extract per locus from the bam file for genotyping + #[clap(long, value_parser, default_value_t = 60)] + max_number_reads: usize, } fn is_file(pathname: &str) -> Result<(), String> { diff --git a/src/parse_bam.rs b/src/parse_bam.rs index 664d238..2a08acf 100644 --- a/src/parse_bam.rs +++ b/src/parse_bam.rs @@ -6,6 +6,7 @@ use rust_htslib::bam::Read; use std::collections::HashMap; use std::env; use url::Url; +use rand::seq::SliceRandom; pub struct Reads { // could consider not to use a hashmap here and use an attribute per phase @@ -42,6 +43,7 @@ pub fn get_overlapping_reads( bam: &mut bam::IndexedReader, repeat: &crate::repeats::RepeatInterval, unphased: bool, + max_number_reads: usize ) -> Option { let tid = bam .header() @@ -85,7 +87,26 @@ pub fn get_overlapping_reads( } None } else { - Some(Reads { seqs, ps }) + // if more than spanning reads are found in seqs, randomly select just items from the vector + // if unphased, just select reads from phase 0 + // if phased, select /2 reads from each phase + let mut rng = rand::thread_rng(); + let mut seqs_filtered = HashMap::from([(0, Vec::new()), (1, Vec::new()), (2, Vec::new())]); + let max_reads_per_phase = HashMap::from([(0, max_number_reads), (1, max_number_reads/2), (2, max_number_reads/2)]); + for (phase, seqs_phase) in seqs.iter() { + let n_reads = seqs_phase.len(); + let n_reads_to_select = if n_reads > max_reads_per_phase[&phase] { + max_reads_per_phase[&phase] + } else { + n_reads + }; + let selected_reads = seqs_phase.choose_multiple(&mut rng, n_reads_to_select); + for read in selected_reads { + seqs_filtered.get_mut(phase).unwrap().push(read.to_vec()); + } + } + + Some(Reads { seqs: seqs_filtered, ps }) } } @@ -128,7 +149,7 @@ fn test_get_overlapping_reads() { }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); - let _reads = get_overlapping_reads(&mut bam, &repeat, unphased); + let _reads = get_overlapping_reads(&mut bam, &repeat, unphased, 60); } #[test] @@ -143,7 +164,7 @@ fn test_get_overlapping_reads_url() { }; let unphased = false; let mut bam = create_bam_reader(&bam, &fasta); - let _reads = get_overlapping_reads(&mut bam, &repeat, unphased); + let _reads = get_overlapping_reads(&mut bam, &repeat, unphased, 60); } #[test] diff --git a/src/phase_insertions.rs b/src/phase_insertions.rs index b0da167..d8cce6f 100644 --- a/src/phase_insertions.rs +++ b/src/phase_insertions.rs @@ -243,7 +243,7 @@ fn find_roots( fn find_cluster_members( cluster: &usize, cluster_to_subclusters: &HashMap, - insertions: &Vec, + insertions: &[String], ) -> Vec { let mut search_cluster = vec![*cluster]; let mut insertions_of_this_cluster = vec![]; diff --git a/src/vcf.rs b/src/vcf.rs index 92a4de3..42e204d 100644 --- a/src/vcf.rs +++ b/src/vcf.rs @@ -479,6 +479,7 @@ fn test_write_vcf_header_from_bam() { debug: false, sorted: false, consensus_reads: 20, + max_number_reads: 60, }; write_vcf_header(&args); } @@ -502,6 +503,7 @@ fn test_write_vcf_header_from_name() { debug: false, sorted: false, consensus_reads: 20, + max_number_reads: 60, }; write_vcf_header(&args); }