From ba3dc6ee566bd9282f40ee9de5f55e692376aa8d Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Wed, 23 Oct 2024 16:00:25 -0700 Subject: [PATCH 1/4] src/Module.cpp: added checks at most division operations to prevent divide-by-zero; most of these are not needed, but the issue arose when input files are empty --- src/Module.cpp | 92 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 36 deletions(-) diff --git a/src/Module.cpp b/src/Module.cpp index cc12452..8e03cd8 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -12,13 +12,14 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ - +// clang-format off #include "Module.hpp" #include #include #include #include #include +#include using std::string; using std::vector; @@ -37,6 +38,8 @@ using std::ostringstream; using std::istringstream; using std::getline; +template using num_lim = std::numeric_limits; + /*****************************************************************************/ /******************* AUX FUNCTIONS *******************************************/ /*****************************************************************************/ @@ -97,7 +100,7 @@ make_exponential_base_groups(vector &base_groups, /************* LINEAR BASE GROUP *************/ // aux function to get linear interval size_t -get_linear_interval(const size_t &num_bases) { +get_linear_interval(const size_t num_bases) { // The the first 9bp as individual residues since odd stuff // can happen there, then we find a grouping value which gives // us a total set of groups below 75. We limit the intervals @@ -174,7 +177,8 @@ double get_corrected_count(size_t count_at_limit, size_t num_reads, size_t dup_level, size_t num_obs) { - // See if we can bail out early + // See if we can bail out early (ADS: can we know if num_reads <= + // count_at_limit always holds?) if (count_at_limit == num_reads) return num_obs; @@ -210,7 +214,7 @@ double get_corrected_count(size_t count_at_limit, // Now we can assume that the number we observed can be // scaled up by this proportion - return num_obs/(1 - p_not_seeing); + return num_obs/std::max(num_lim::min(), 1.0 - p_not_seeing); } // Function to calculate the deviation of a histogram with 100 bins from a @@ -277,7 +281,8 @@ sum_deviation_from_normal(const array &gc_count, // centre of the model mode = first_mode; } else { - mode /= mode_duplicates; + // ADS: check if we need to avoid divide-by-zero here + mode /= std::max(static_cast(1), mode_duplicates); } // We can now work out a theoretical distribution @@ -286,7 +291,8 @@ sum_deviation_from_normal(const array &gc_count, stdev += (i - mode) * (i - mode) * gc_count[i]; } - stdev = stdev / (total_count-1); + // ADS: check if we need to avoid divide-by-zero here + stdev = stdev / std::max(num_lim::min(), total_count - 1.0); stdev = sqrt(stdev); /******************* END COPIED FROM FASTQC **********************/ @@ -297,20 +303,24 @@ sum_deviation_from_normal(const array &gc_count, // ADS: lonely magic below; what is the 100? for (size_t i = 0; i <= 100; ++i) { z = i - mode; + // ADS: check if we need to avoid divide-by-zero here theoretical[i] = exp(- (z*z)/ (2.0 * stdev *stdev)); theoretical_sum += theoretical[i]; } // Normalize theoretical so it sums to the total of readsq for (size_t i = 0; i <= 100; ++i) { - theoretical[i] = theoretical[i] * total_count / theoretical_sum; + // ADS: check if we need to avoid divide-by-zero here + theoretical[i] = theoretical[i] * total_count / + std::max(num_lim::min(), theoretical_sum); } for (size_t i = 0; i <= 100; ++i) { ans += fabs(gc_count[i] - theoretical[i]); } - // Fractional deviation - return 100.0 * ans / total_count; + // Fractional deviation (ADS: check if we need to avoid + // divide-by-zero here) + return 100.0 * ans / std::max(num_lim::min(), total_count); } /***************************************************************/ @@ -446,7 +456,8 @@ ModuleBasicStatistics::summarize_module(FastqStats &stats) { total_bases += i * stats.long_read_length_freq[i - FastqStats::SHORT_READ_THRESHOLD]; } - avg_read_length = total_bases / total_sequences; + avg_read_length = + total_bases / std::max(static_cast(1), total_sequences); // counts bases G and C in each base position avg_gc = 0; @@ -454,7 +465,7 @@ ModuleBasicStatistics::summarize_module(FastqStats &stats) { // GC % // GS: TODO delete gc calculation during stream and do it using the total G // counts in all bases - avg_gc = 100 * stats.total_gc / static_cast(total_bases); + avg_gc = 100 * stats.total_gc / std::max(1.0, static_cast(total_bases)); } @@ -692,6 +703,7 @@ ModulePerBaseSequenceQuality::summarize_module(FastqStats &stats) { } const size_t base_positions = base_groups[group].end - base_groups[group].start + 1; + assert(base_positions != static_cast(0)); group_mean[group] = mean_group_sum / base_positions; group_ldecile[group] = static_cast(ldecile_group_sum) / base_positions; group_lquartile[group] = static_cast(lquartile_group_sum) / base_positions; @@ -819,17 +831,19 @@ ModulePerTileSequenceQuality::summarize_module(FastqStats &stats) { // Now transform sum into mean for (size_t i = 0; i < max_read_length; ++i) - if (position_counts[i] > 0) + if (position_counts[i] > 0.0) mean_in_base[i] = mean_in_base[i] / position_counts[i]; else - mean_in_base[i] = 0; + mean_in_base[i] = 0.0; for (auto &v : tile_position_quality) { const size_t lim = v.second.size(); for (size_t i = 0; i < lim; ++i) { // transform sum of all qualities in mean - const size_t count_at_pos = - stats.tile_position_count.find(v.first)->second[i]; + const auto itr = stats.tile_position_count.find(v.first); + if (itr == cend(stats.tile_position_count)) + throw runtime_error("failure ModulePerTileSequenceQuality::summarize_module"); + const size_t count_at_pos = itr->second[i]; if (count_at_pos > 0) v.second[i] = v.second[i] / count_at_pos; @@ -882,6 +896,7 @@ ModulePerTileSequenceQuality::write_module(ostream &os) { inline double round_quantile(const double val, const double num_quantiles) { + // ADS: check if we need to worry about divide by zero here return static_cast(val * num_quantiles) / num_quantiles; } @@ -937,6 +952,7 @@ ModulePerTileSequenceQuality::make_html_data() { // We will now discretize the quantiles so plotly understands // the color scheme static const double num_quantiles = 20.0; + // ADS: not sure if we need to worry about divide by zero here? double mid_point = round_quantile(min_val/(min_val - max_val), num_quantiles); // - 10: red @@ -1054,7 +1070,7 @@ Module(ModulePerBaseSequenceContent::module_name) { void ModulePerBaseSequenceContent::summarize_module(FastqStats &stats) { double a_group, t_group, g_group, c_group, n_group; - double a_pos, t_pos, g_pos, c_pos, n_pos; + double a_pos{}, t_pos{}, g_pos{}, c_pos{}, n_pos{}; double total; //a+c+t+g+n max_diff = 0.0; @@ -1105,10 +1121,10 @@ ModulePerBaseSequenceContent::summarize_module(FastqStats &stats) { const double total_pos = static_cast(a_pos + c_pos + g_pos + t_pos + n_pos); - a_pos = 100.0 * a_pos / total_pos; - c_pos = 100.0 * c_pos / total_pos; - g_pos = 100.0 * g_pos / total_pos; - t_pos = 100.0 * t_pos / total_pos; + a_pos = 100.0 * a_pos / std::max(num_lim::min(), total_pos); + c_pos = 100.0 * c_pos / std::max(num_lim::min(), total_pos); + g_pos = 100.0 * g_pos / std::max(num_lim::min(), total_pos); + t_pos = 100.0 * t_pos / std::max(num_lim::min(), total_pos); // for WGBS, we only test non-bisulfite treated bases if (!is_reverse_complement) @@ -1135,11 +1151,10 @@ ModulePerBaseSequenceContent::summarize_module(FastqStats &stats) { // turns above values to percent total = static_cast(a_group + c_group + t_group + g_group + n_group); - a_pct[group] = 100.0*a_group / total; - c_pct[group] = 100.0*c_group / total; - g_pct[group] = 100.0*g_group / total; - t_pct[group] = 100.0*t_group / total; - + a_pct[group] = 100.0*a_group / std::max(num_lim::min(), total); + c_pct[group] = 100.0*c_group / std::max(num_lim::min(), total); + g_pct[group] = 100.0*g_group / std::max(num_lim::min(), total); + t_pct[group] = 100.0*t_group / std::max(num_lim::min(), total); } } @@ -1395,12 +1410,14 @@ ModulePerBaseNContent::summarize_module(FastqStats &stats) { this_n_total = (i < FastqStats::SHORT_READ_THRESHOLD) ? (stats.cumulative_read_length_freq[i]) : (stats.long_cumulative_read_length_freq[i - FastqStats::SHORT_READ_THRESHOLD]); - this_n_pct = this_n_cnt / static_cast(this_n_total); + this_n_pct = this_n_cnt / std::max(num_lim::min(), + static_cast(this_n_total)); max_n_pct = max(max_n_pct, this_n_pct); group_n_cnt += this_n_cnt; group_n_total += this_n_total; } - n_pct[group] = 100.0*group_n_cnt / static_cast(group_n_total); + n_pct[group] = 100.0*group_n_cnt / std::max(num_lim::min(), + static_cast(group_n_total)); } } @@ -1627,15 +1644,15 @@ ModuleSequenceDuplicationLevels::summarize_module(FastqStats &stats) { } // "Sequence duplication estimate" in the summary - total_deduplicated_pct = 100.0 * seq_dedup / seq_total; + total_deduplicated_pct = 100.0 * seq_dedup / std::max(1.0, seq_total); // Convert to percentage for (auto &v : percentage_deduplicated) - v = 100.0 * v / seq_dedup; // Percentage of unique sequences in bin + v = 100.0 * v / std::max(1.0, seq_dedup); // Percentage of unique sequences in bin // Convert to percentage for (auto &v : percentage_total) - v = 100.0 * v / seq_total; // Percentage of sequences in bin + v = 100.0 * v / std::max(1.0, seq_total); // Percentage of sequences in bin } void @@ -1796,7 +1813,7 @@ ModuleOverrepresentedSequences::make_grade() { // implment pass warn fail for overrep sequences if (grade != "fail") { // get percentage that overrep reads represent - double pct = 100.0 * seq.second / num_reads; + double pct = 100.0 * seq.second / std::max(static_cast(1), num_reads); if (pct > grade_error) { grade = "fail"; } @@ -1813,7 +1830,7 @@ ModuleOverrepresentedSequences::write_module(ostream &os) { os << "#Sequence\tCount\tPercentage\tPossible Source\n"; for (auto seq : overrep_sequences) { os << seq.first << "\t" << seq.second << "\t" << - 100.0 * seq.second / num_reads << "\t" + 100.0 * seq.second / std::max(static_cast(1), num_reads) << "\t" << get_matching_contaminant(seq.first) << "\n"; } } @@ -1836,7 +1853,7 @@ ModuleOverrepresentedSequences::make_html_data() { for (auto v : overrep_sequences) { data << "" << v.first << ""; data << "" << v.second << ""; - data << "" << 100.0 * v.second / num_reads << ""; + data << "" << 100.0 * v.second / std::max(static_cast(1), num_reads) << ""; data << "" << get_matching_contaminant(v.first) << ""; data << ""; @@ -1907,7 +1924,8 @@ ModuleAdapterContent::summarize_module(FastqStats &stats) { for (size_t i = 0; i < adapter_pos_pct.size(); ++i) { for (size_t j = 0; j < adapter_pos_pct[0].size(); ++j) { adapter_pos_pct[i][j] *= 100.0; - adapter_pos_pct[i][j] /= static_cast(stats.num_reads); + adapter_pos_pct[i][j] /= std::max(num_lim::min(), + static_cast(stats.num_reads)); } } } @@ -2077,7 +2095,8 @@ ModuleKmerContent::summarize_module(FastqStats &stats) { observed_count = stats.kmer_count[(i << Constants::bit_shift_kmer) | kmer]; - expected_count = pos_kmer_count[i] / dividend; + expected_count = pos_kmer_count[i] / std::max(num_lim::min(), dividend); + // ADS: below, denom can't be zero if not above? obs_exp_ratio = (expected_count > 0) ? (observed_count / expected_count) : 0; if (i == 0 || obs_exp_ratio > obs_exp_max[kmer]) { @@ -2146,7 +2165,7 @@ ModuleKmerContent::make_html_data() { for (size_t i = 0; i < lim; ++i) { const size_t kmer = kmers_to_report[i].first; - const double log_obs_exp = log(kmers_to_report[i].second)/log(2); + const double log_obs_exp = log(kmers_to_report[i].second)/log(2.0); if (!seen_first) seen_first = true; else @@ -2175,3 +2194,4 @@ ModuleKmerContent::make_html_data() { } return data.str(); } +// clang-format on From f11605a563b59290054f365e36400fa87ad5806e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Wed, 23 Oct 2024 16:05:15 -0700 Subject: [PATCH 2/4] src/falco.cpp: added an option for empty input and the logic to detect if an input file is empty --- src/falco.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/falco.cpp b/src/falco.cpp index 30a64c7..ddb42b0 100644 --- a/src/falco.cpp +++ b/src/falco.cpp @@ -16,6 +16,7 @@ */ #include +#include #include #include "FalcoConfig.hpp" @@ -293,6 +294,7 @@ main(int argc, const char **argv) { bool skip_html = false; bool skip_short_summary = false; bool do_call = false; + bool allow_empty_input = false; // a tmp boolean to keep compatibility with FastQC bool tmp_compatibility_only = false; @@ -542,6 +544,14 @@ main(int argc, const char **argv) { " in programs that are very strict about the " " FastQC output format).", false, do_call); + + opt_parse.add_opt( + "allow-empty-input", '\0', + "[Falco only] allow empty input files and generate empty output files " + "without en error state. WARNING: using this option can mask problems in " + "other parts of a workflow.", + false, allow_empty_input); + vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { @@ -578,6 +588,23 @@ main(int argc, const char **argv) { return EXIT_FAILURE; } + // ADS: make sure all input files are non-empty unless user oks it + if (!allow_empty_input) { + for (const auto &fn : leftover_args) { + std::error_code ec; + const bool empty_file = std::filesystem::is_empty(fn, ec); + if (ec) { + cerr << "Error reading file: " << fn << " (" << ec.message() << ")" + << endl; + return EXIT_FAILURE; + } + else if (empty_file) { + cerr << "Input file is empty: " << fn << endl; + return EXIT_FAILURE; + } + } + } + if (!outdir.empty()) { if (!summary_filename.empty()) cerr << "[WARNING] specifying custom output directory but also " From 7c455901486ec260e27c99c3d98d05fcf7c89129 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Wed, 23 Oct 2024 16:19:43 -0700 Subject: [PATCH 3/4] src/Module.cpp: Removed forgotten clang-format guards --- src/Module.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Module.cpp b/src/Module.cpp index 8e03cd8..f352f51 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -12,7 +12,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ -// clang-format off + #include "Module.hpp" #include #include @@ -2194,4 +2194,3 @@ ModuleKmerContent::make_html_data() { } return data.str(); } -// clang-format on From 9945aebf04a152996007deee7654f49f9928efa1 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Thu, 24 Oct 2024 18:30:36 -0700 Subject: [PATCH 4/4] src/falco.cpp: using std::filesystem for dir_exists and avoiding filesize of 0 impacting progress bar --- src/falco.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/falco.cpp b/src/falco.cpp index ddb42b0..6d49120 100644 --- a/src/falco.cpp +++ b/src/falco.cpp @@ -38,6 +38,8 @@ using std::string; using std::to_string; using std::vector; +namespace fs = std::filesystem; + using std::chrono::duration_cast; using std::chrono::system_clock; using time_point = std::chrono::time_point; @@ -62,12 +64,7 @@ log_process(const string &s) { // Function to check existance of directory static bool dir_exists(const string &path) { - struct stat info; - if (stat(path.c_str(), &info) != 0) - return false; - else if (info.st_mode & S_IFDIR) - return true; - return false; + return fs::exists(path) && fs::is_directory(path); } // Read any file type until the end and logs progress @@ -76,7 +73,7 @@ template void read_stream_into_stats(T &in, FastqStats &stats, FalcoConfig &falco_config) { // open file - size_t file_size = in.load(); + size_t file_size = std::max(in.load(), static_cast(1)); size_t tot_bytes_read = 0; // Read record by record @@ -91,11 +88,10 @@ read_stream_into_stats(T &in, FastqStats &stats, FalcoConfig &falco_config) { // if I could not get tile information from read names, I need to tell this to // config so it does not output tile data on the summary or html - if (in.tile_ignore) { + if (in.tile_ignore) falco_config.do_tile = false; - } - if (tot_bytes_read < file_size && !quiet) + if (!quiet && tot_bytes_read < file_size) progress.report(cerr, file_size); }