Skip to content

Commit

Permalink
argument to limit number of reads for genotyping
Browse files Browse the repository at this point in the history
  • Loading branch information
wdecoster committed Aug 22, 2024
1 parent 99f6257 commit 8ab83b6
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "STRdust"
version = "0.10.0"
version = "0.11.0"
edition = "2021"


Expand Down
11 changes: 8 additions & 3 deletions src/genotype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ fn genotype_repeat(
&& args.haploid.as_ref().unwrap().contains(&repeat.chrom))
|| args.unphased;

let reads = match crate::parse_bam::get_overlapping_reads(bam, repeat, unphased) {
let reads = match crate::parse_bam::get_overlapping_reads(bam, repeat, unphased, args.max_number_reads) {
Some(seqs) => seqs,
None => {
// Return a missing genotype if no (phased) reads overlap the repeat
Expand Down Expand Up @@ -369,7 +369,7 @@ mod tests {
let unphased = false;
let repeat_compressed_reference = repeat.make_repeat_compressed_sequence(&fasta, flanking);
let mut bam = parse_bam::create_bam_reader(&bam, &fasta);
let binding = crate::parse_bam::get_overlapping_reads(&mut bam, &repeat, unphased).unwrap();
let binding = crate::parse_bam::get_overlapping_reads(&mut bam, &repeat, unphased, 60).unwrap();
let read = binding
.seqs
.get(&1)
Expand Down Expand Up @@ -420,6 +420,7 @@ mod tests {
debug: false,
sorted: false,
consensus_reads: 20,
max_number_reads: 60
};
let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta);
let genotype = genotype_repeat(&repeat, &args, &mut bam);
Expand Down Expand Up @@ -451,6 +452,7 @@ mod tests {
debug: false,
sorted: false,
consensus_reads: 20,
max_number_reads: 60
};
let mut bam = parse_bam::create_bam_reader(&args.bam, &args.fasta);
let genotype = genotype_repeat(&repeat, &args, &mut bam);
Expand All @@ -476,6 +478,7 @@ mod tests {
debug: false,
sorted: false,
consensus_reads: 20,
max_number_reads: 60
};
let repeat = crate::repeats::RepeatInterval {
chrom: String::from("chr7"),
Expand Down Expand Up @@ -507,6 +510,7 @@ mod tests {
debug: false,
sorted: false,
consensus_reads: 20,
max_number_reads: 60
};
let repeat = crate::repeats::RepeatInterval {
chrom: String::from("chr7"),
Expand Down Expand Up @@ -537,7 +541,8 @@ mod tests {
haploid: None,
debug: false,
sorted: false,
consensus_reads: 20
consensus_reads: 20,
max_number_reads: 60,
};

let repeat = crate::repeats::RepeatInterval {
Expand Down
4 changes: 4 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ pub struct Cli {
/// Max number of reads to use to generate consensus alt sequence
#[clap(long, value_parser, default_value_t = 20)]
consensus_reads: usize,

// Max number of reads to extract per locus from the bam file for genotyping
#[clap(long, value_parser, default_value_t = 60)]
max_number_reads: usize,
}

fn is_file(pathname: &str) -> Result<(), String> {
Expand Down
27 changes: 24 additions & 3 deletions src/parse_bam.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use rust_htslib::bam::Read;
use std::collections::HashMap;
use std::env;
use url::Url;
use rand::seq::SliceRandom;

pub struct Reads {
// could consider not to use a hashmap here and use an attribute per phase
Expand Down Expand Up @@ -42,6 +43,7 @@ pub fn get_overlapping_reads(
bam: &mut bam::IndexedReader,
repeat: &crate::repeats::RepeatInterval,
unphased: bool,
max_number_reads: usize
) -> Option<Reads> {
let tid = bam
.header()
Expand Down Expand Up @@ -85,7 +87,26 @@ pub fn get_overlapping_reads(
}
None
} else {
Some(Reads { seqs, ps })
// if more than <max_number_reads> spanning reads are found in seqs, randomly select just <max_number_reads> items from the vector
// if unphased, just select <max_number_reads> reads from phase 0
// if phased, select <max_number_reads>/2 reads from each phase
let mut rng = rand::thread_rng();
let mut seqs_filtered = HashMap::from([(0, Vec::new()), (1, Vec::new()), (2, Vec::new())]);
let max_reads_per_phase = HashMap::from([(0, max_number_reads), (1, max_number_reads/2), (2, max_number_reads/2)]);
for (phase, seqs_phase) in seqs.iter() {
let n_reads = seqs_phase.len();
let n_reads_to_select = if n_reads > max_reads_per_phase[&phase] {
max_reads_per_phase[&phase]
} else {
n_reads
};
let selected_reads = seqs_phase.choose_multiple(&mut rng, n_reads_to_select);
for read in selected_reads {
seqs_filtered.get_mut(phase).unwrap().push(read.to_vec());
}
}

Some(Reads { seqs: seqs_filtered, ps })
}
}

Expand Down Expand Up @@ -128,7 +149,7 @@ fn test_get_overlapping_reads() {
};
let unphased = false;
let mut bam = create_bam_reader(&bam, &fasta);
let _reads = get_overlapping_reads(&mut bam, &repeat, unphased);
let _reads = get_overlapping_reads(&mut bam, &repeat, unphased, 60);
}

#[test]
Expand All @@ -143,7 +164,7 @@ fn test_get_overlapping_reads_url() {
};
let unphased = false;
let mut bam = create_bam_reader(&bam, &fasta);
let _reads = get_overlapping_reads(&mut bam, &repeat, unphased);
let _reads = get_overlapping_reads(&mut bam, &repeat, unphased, 60);
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion src/phase_insertions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ fn find_roots(
fn find_cluster_members(
cluster: &usize,
cluster_to_subclusters: &HashMap<usize, (usize, usize)>,
insertions: &Vec<String>,
insertions: &[String],
) -> Vec<String> {
let mut search_cluster = vec![*cluster];
let mut insertions_of_this_cluster = vec![];
Expand Down
2 changes: 2 additions & 0 deletions src/vcf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,7 @@ fn test_write_vcf_header_from_bam() {
debug: false,
sorted: false,
consensus_reads: 20,
max_number_reads: 60,
};
write_vcf_header(&args);
}
Expand All @@ -502,6 +503,7 @@ fn test_write_vcf_header_from_name() {
debug: false,
sorted: false,
consensus_reads: 20,
max_number_reads: 60,
};
write_vcf_header(&args);
}
Expand Down

0 comments on commit 8ab83b6

Please sign in to comment.