From 3245e62f0ea3b5fdef1d908fbf949a24386e5c65 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 18 Oct 2024 21:09:47 -0700 Subject: [PATCH] Adding threads for functions that read BAM input --- src/bound_pop.cpp | 15 +++++-------- src/c_curve.cpp | 15 +++++-------- src/gc_extrap.cpp | 7 ++++-- src/lc_extrap.cpp | 15 +++++-------- src/load_data_for_complexity.cpp | 37 ++++++++++++++++---------------- src/load_data_for_complexity.hpp | 10 +++++---- src/pop_size.cpp | 15 +++++-------- 7 files changed, 49 insertions(+), 65 deletions(-) diff --git a/src/bound_pop.cpp b/src/bound_pop.cpp index 2088b27..f9219a9 100644 --- a/src/bound_pop.cpp +++ b/src/bound_pop.cpp @@ -64,6 +64,7 @@ bound_pop_main(const int argc, const char *argv[]) { #ifdef HAVE_HTSLIB bool BAM_FORMAT_INPUT = false; size_t MAX_SEGMENT_LENGTH = 5000; + uint32_t n_threads{1}; #endif size_t max_num_points = 10; @@ -112,6 +113,8 @@ of observed species in an initial sample.)"; "maximum segment length when merging " "paired end bam reads", false, MAX_SEGMENT_LENGTH); + opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", + false, n_threads); #endif opt_parse.add_opt("quick", 'Q', "quick mode, estimate without bootstrapping", false, @@ -159,20 +162,12 @@ of observed species in an initial sample.)"; else if (BAM_FORMAT_INPUT && PAIRED_END) { if (VERBOSE) cerr << "PAIRED_END_BAM_INPUT" << endl; - // size_t n_paired = 0; - // size_t n_mates = 0; - n_obs = load_counts_BAM_pe(input_file_name, - // n_paired, n_mates, - counts_hist); - // if (VERBOSE) { - // cerr << "MERGED PAIRED END READS = " << n_paired << endl; - // cerr << "MATES PROCESSED = " << n_mates << endl; - // } + n_obs = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); } else if (BAM_FORMAT_INPUT) { if (VERBOSE) cerr << "BAM_INPUT" << endl; - n_obs = load_counts_BAM_se(input_file_name, counts_hist); + n_obs = load_counts_BAM_se(n_threads, input_file_name, counts_hist); } #endif else if (PAIRED_END) { diff --git a/src/c_curve.cpp b/src/c_curve.cpp index a003fb7..69fb8a3 100644 --- a/src/c_curve.cpp +++ b/src/c_curve.cpp @@ -96,6 +96,7 @@ c_curve_main(const int argc, const char *argv[]) { #ifdef HAVE_HTSLIB bool BAM_FORMAT_INPUT = false; size_t MAX_SEGMENT_LENGTH = 5000; + uint32_t n_threads{1}; #endif const string description = R"( @@ -124,6 +125,8 @@ but instead resamples from the given data.)"; "maximum segment length when merging " "paired end bam reads", false, MAX_SEGMENT_LENGTH); + opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", + false, n_threads); #endif opt_parse.add_opt("seed", 'r', "seed for random number generator", false, seed); @@ -172,20 +175,12 @@ but instead resamples from the given data.)"; else if (BAM_FORMAT_INPUT && PAIRED_END) { if (VERBOSE) cerr << "PAIRED_END_BAM_INPUT" << endl; - // const size_t MAX_READS_TO_HOLD = 5000000; - // size_t n_paired = 0; - // size_t n_mates = 0; - n_reads = load_counts_BAM_pe(input_file_name, - // n_paired, n_mates, - counts_hist); - // if (VERBOSE) - // cerr << "MERGED PAIRED END READS = " << n_paired << endl - // << "MATES PROCESSED = " << n_mates << endl; + n_reads = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); } else if (BAM_FORMAT_INPUT) { if (VERBOSE) cerr << "BAM_INPUT" << endl; - n_reads = load_counts_BAM_se(input_file_name, counts_hist); + n_reads = load_counts_BAM_se(n_threads, input_file_name, counts_hist); } #endif else if (PAIRED_END) { diff --git a/src/gc_extrap.cpp b/src/gc_extrap.cpp index 8bd25f1..40a2096 100644 --- a/src/gc_extrap.cpp +++ b/src/gc_extrap.cpp @@ -99,6 +99,7 @@ gc_extrap_main(const int argc, const char *argv[]) { double c_level = 0.95; #ifdef HAVE_HTSLIB bool BAM_FORMAT_INPUT = false; + uint32_t n_threads{1}; #endif const string description = R"( @@ -144,6 +145,8 @@ gc_extrap_main(const int argc, const char *argv[]) { #ifdef HAVE_HTSLIB opt_parse.add_opt("bam", '\0', "input is in BAM format", false, BAM_FORMAT_INPUT); + opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", + false, n_threads); #endif opt_parse.add_opt("seed", 'r', "seed for random number generator", false, seed); @@ -185,8 +188,8 @@ gc_extrap_main(const int argc, const char *argv[]) { else if (BAM_FORMAT_INPUT) { if (VERBOSE) cerr << "BAM_INPUT" << endl; - n_reads = load_coverage_counts_BAM(infile, seed, bin_size, max_width, - coverage_hist); + n_reads = load_coverage_counts_BAM(n_threads, infile, seed, bin_size, + max_width, coverage_hist); } #endif else { diff --git a/src/lc_extrap.cpp b/src/lc_extrap.cpp index 7fb0d81..72ccccf 100644 --- a/src/lc_extrap.cpp +++ b/src/lc_extrap.cpp @@ -78,6 +78,7 @@ lc_extrap_main(const int argc, const char **argv) { #ifdef HAVE_HTSLIB bool BAM_FORMAT_INPUT = false; size_t MAX_SEGMENT_LENGTH = 5000; + uint32_t n_threads{1}; #endif string description = @@ -112,6 +113,8 @@ method has been used for many different purposes since then. "maximum segment length when merging " "paired end bam reads", false, MAX_SEGMENT_LENGTH); + opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", + false, n_threads); #endif opt_parse.add_opt("pe", 'P', "input is paired end read file", false, PAIRED_END); @@ -169,20 +172,12 @@ method has been used for many different purposes since then. else if (BAM_FORMAT_INPUT && PAIRED_END) { if (VERBOSE) cerr << "PAIRED_END_BAM_INPUT" << endl; - // size_t n_paired = 0; - // size_t n_mates = 0; - n_reads = load_counts_BAM_pe(input_file_name, - // n_paired, n_mates, - counts_hist); - // if (VERBOSE) { - // cerr << "MERGED PAIRED END READS = " << n_paired << endl; - // cerr << "MATES PROCESSED = " << n_mates << endl; - // } + n_reads = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); } else if (BAM_FORMAT_INPUT) { if (VERBOSE) cerr << "BAM_INPUT" << endl; - n_reads = load_counts_BAM_se(input_file_name, counts_hist); + n_reads = load_counts_BAM_se(n_threads, input_file_name, counts_hist); } #endif else if (PAIRED_END) { diff --git a/src/load_data_for_complexity.cpp b/src/load_data_for_complexity.cpp index 568b129..240d67f 100644 --- a/src/load_data_for_complexity.cpp +++ b/src/load_data_for_complexity.cpp @@ -582,23 +582,19 @@ update_duplicate_counts_hist_BAM(const T &curr, const T &prev, ++current_count; } -// template -// size_t -// load_counts_BAM_se(/*const size_t n_threads, */ -// const string &inputfile, vector &counts_hist) { - -template +template size_t -load_counts_BAM(/*const size_t n_threads, */ - const string &inputfile, vector &counts_hist) { - // bamxx::bam_tpool tp(n_threads); +load_counts_BAM(const uint32_t n_threads, const string &inputfile, + vector &counts_hist) { + bamxx::bam_tpool tp(n_threads); bamxx::bam_in hts(inputfile); // assume already checked bamxx::bam_header hdr(hts); if (!hdr) throw runtime_error("failed to read header"); - // if (n_threads > 1) tp.set_io(hts); + if (n_threads > 1) + tp.set_io(hts); // find first mapped read to start bamxx::bam_rec aln; @@ -646,15 +642,15 @@ load_counts_BAM(/*const size_t n_threads, */ } size_t -load_counts_BAM_se(/*const size_t n_threads, */ - const string &inputfile, vector &counts_hist) { - return load_counts_BAM(inputfile, counts_hist); +load_counts_BAM_se(const uint32_t n_threads, const string &inputfile, + vector &counts_hist) { + return load_counts_BAM(n_threads, inputfile, counts_hist); } size_t -load_counts_BAM_pe(/*const size_t n_threads, */ - const string &inputfile, vector &counts_hist) { - return load_counts_BAM(inputfile, counts_hist); +load_counts_BAM_pe(const uint32_t n_threads, const string &inputfile, + vector &counts_hist) { + return load_counts_BAM(n_threads, inputfile, counts_hist); } struct genomic_interval { @@ -726,18 +722,21 @@ update_duplicate_coverage_hist(const T &curr, const T &prev, // ADS: don't care if mapped reads are SE or PE, we only need the // first mate for each mapped read size_t -load_coverage_counts_BAM(const string &inputfile, const uint64_t seed, - const size_t bin_size, const size_t max_width, +load_coverage_counts_BAM(const uint32_t n_threads, const string &inputfile, + const uint64_t seed, const size_t bin_size, + const size_t max_width, vector &coverage_hist) { srand(time(0) + getpid()); std::mt19937 generator(seed); + bamxx::bam_tpool tp(n_threads); bamxx::bam_in hts(inputfile); // assume already checked bamxx::bam_header hdr(hts); if (!hdr) throw runtime_error("failed to read header"); - // if (n_threads > 1) tp.set_io(hts); + if (n_threads > 1) + tp.set_io(hts); // find first mapped read to start bamxx::bam_rec aln; diff --git a/src/load_data_for_complexity.hpp b/src/load_data_for_complexity.hpp index 4dbc4f2..60d0eb7 100644 --- a/src/load_data_for_complexity.hpp +++ b/src/load_data_for_complexity.hpp @@ -54,16 +54,18 @@ load_counts_BED_se(const std::string &input_file_name, #ifdef HAVE_HTSLIB std::size_t -load_counts_BAM_pe(const std::string &input_file_name, - // std::size_t &n_paired, std::size_t &n_mates, +load_counts_BAM_pe(const std::uint32_t n_threads, + const std::string &input_file_name, std::vector &counts_hist); std::size_t -load_counts_BAM_se(const std::string &input_file_name, +load_counts_BAM_se(const std::uint32_t n_threads, + const std::string &input_file_name, std::vector &counts_hist); std::size_t -load_coverage_counts_BAM(const std::string &input_file_name, +load_coverage_counts_BAM(const std::uint32_t n_threads, + const std::string &input_file_name, const std::uint64_t seed, const std::size_t bin_size, const std::size_t max_width, std::vector &coverage_hist); diff --git a/src/pop_size.cpp b/src/pop_size.cpp index c673bb2..0363625 100644 --- a/src/pop_size.cpp +++ b/src/pop_size.cpp @@ -74,6 +74,7 @@ pop_size_main(const int argc, const char *argv[]) { #ifdef HAVE_HTSLIB bool BAM_FORMAT_INPUT = false; size_t MAX_SEGMENT_LENGTH = 5000; + uint32_t n_threads{1}; #endif const string description = @@ -106,6 +107,8 @@ pop_size_main(const int argc, const char *argv[]) { "maximum segment length when merging " "paired end bam reads", false, MAX_SEGMENT_LENGTH); + opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", + false, n_threads); #endif opt_parse.add_opt("pe", 'P', "input is paired end read file", false, PAIRED_END); @@ -163,20 +166,12 @@ pop_size_main(const int argc, const char *argv[]) { else if (BAM_FORMAT_INPUT && PAIRED_END) { if (VERBOSE) cerr << "PAIRED_END_BAM_INPUT" << endl; - // size_t n_paired = 0; - // size_t n_mates = 0; - n_reads = load_counts_BAM_pe(input_file_name, - // n_paired, n_mates, - counts_hist); - // if (VERBOSE) { - // cerr << "MERGED PAIRED END READS = " << n_paired << endl; - // cerr << "MATES PROCESSED = " << n_mates << endl; - // } + n_reads = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); } else if (BAM_FORMAT_INPUT) { if (VERBOSE) cerr << "BAM_INPUT" << endl; - n_reads = load_counts_BAM_se(input_file_name, counts_hist); + n_reads = load_counts_BAM_se(n_threads, input_file_name, counts_hist); } #endif else if (PAIRED_END) {