From ab36797d5ffe172ed40208c1fb715ace5069bbc8 Mon Sep 17 00:00:00 2001
From: Wojtek Bazant
Date: Thu, 26 Aug 2021 06:55:40 -0400
Subject: [PATCH] Model the noise through a beta binomial distribution

When I fit a beta distribution to the proportions of markers in taxa,
I get a sample size estimate of 6778.2 for EukDetect
---
 README.md                               |  4 ++-
 marker_alignments/filter/main.py        |  8 +++--
 marker_alignments/filter/noise_model.py | 34 +++++++++++++------
 scripts/refdb_stats.py                  | 43 +++++++++++++++++++++++++
 tests/filter_noise_model.py             |  4 ++-
 5 files changed, 79 insertions(+), 14 deletions(-)
 create mode 100644 scripts/refdb_stats.py

diff --git a/README.md b/README.md
index 1b22151..6342327 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ The default `--refdb-format` is `generic`, which tries to produce nice names, bu
 Apply a filter to alignments sumarized at the taxa level.
 
 ### Usage
-This is a demo program, which asks for two markers to be present for a taxon
+This is a demo program, which asks for two markers to be present for a taxon.
 
 ```
 summarize_marker_alignments --input tests/data/example.sam --output /dev/stdout --output-type taxon_read_and_marker_count \
@@ -75,6 +75,8 @@ summarize_marker_alignments --input tests/data/example.sam --output /dev/stdout
   --require-min-markers 2
 ```
 
+There's an option to model the noise with a beta binomial distribution and pick a good `--require-min-markers` value, but it's still experimental.
+
 ## Known issues
 
 Quantitative information obtained by aligning to a EukDetect reference database might be a little bit dodgy since there are typically very few reads.
diff --git a/marker_alignments/filter/main.py b/marker_alignments/filter/main.py
index 733dfd3..014e64a 100644
--- a/marker_alignments/filter/main.py
+++ b/marker_alignments/filter/main.py
@@ -17,6 +17,7 @@ def main(argv=sys.argv[1:]):
     parser.add_argument("--require-min-markers", type=int, action="store", dest="require_min_markers", help = "Require min markers to keep a taxon")
     parser.add_argument("--use-noise-model-with-confidence-threshold", type=float, action="store", dest="noise_threshold", help = "Use a null model where markers associate with taxa at random with the provided confidence threshold, and infer an appropriate value for --require-min-markers")
     parser.add_argument("--total-num-taxa", type=int, action="store", dest="total_num_taxa", help = "Total number of taxa in the reference - required for fitting the noise model")
+    parser.add_argument("--taxon-to-markers-beta-sample-size", type=float, action="store", dest="beta_sample_size", help = "Sample size (sum of shape parameters a and b when proportion of markers per taxon is modelled as a beta distribution) - required for fitting the noise model")
     parser.add_argument("--output", type=str, action="store", dest="output_path", help = "output path", required=True)
     parser.add_argument("--verbose", action="store_true", dest="verbose", help = "turn on logging")
 
@@ -27,6 +28,9 @@ def main(argv=sys.argv[1:]):
     if options.noise_threshold and not options.total_num_taxa:
         raise ValueError("--total-num-taxa required if --use-noise-model-with-confidence-threshold is provided")
 
+    if options.noise_threshold and not options.beta_sample_size:
+        raise ValueError("--taxon-to-markers-beta-sample-size required if --use-noise-model-with-confidence-threshold is provided")
+
     lines, fieldnames = [], {}
     with open(options.input_path) as f:
         reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
@@ -46,9 +50,9 @@ def main(argv=sys.argv[1:]):
             if n not in taxon_counts_with_num_markers:
                 taxon_counts_with_num_markers[n] = 0
             taxon_counts_with_num_markers[n] += 1
-        cutoff, confidence = cutoff_fit_for_noise_model(taxon_counts_with_num_markers, options.noise_threshold)
+        cutoff, confidence = cutoff_fit_for_noise_model(taxon_counts_with_num_markers, options.beta_sample_size, options.noise_threshold, logger)
         num_markers = sum([j * taxon_counts_with_num_markers[j] for j in taxon_counts_with_num_markers])
-        logger.info("Built a null model of %s markers each independently assigned uniformly at random to one of all %s taxa", num_markers, options.total_num_taxa)
+        logger.info("Built a null model of %s markers assigned to one of all %s taxa through a beta binomial distribution", num_markers, options.total_num_taxa)
         logger.info("In the data, taxa the fewest number of markers probable under the null model with less than --use-noise-model-with-confidence-threshold value %s is %s ", options.noise_threshold, cutoff)
         logger.info("The likelihood of %s taxa with %s markers occuring under the null model was calculated to be %s", taxon_counts_with_num_markers[cutoff], cutoff, confidence )
         if options.require_min_markers and options.require_min_markers > cutoff:
diff --git a/marker_alignments/filter/noise_model.py b/marker_alignments/filter/noise_model.py
index 2e03d39..e17fc9c 100644
--- a/marker_alignments/filter/noise_model.py
+++ b/marker_alignments/filter/noise_model.py
@@ -1,7 +1,7 @@
-from scipy.stats import binom
+from scipy.stats import binom, betabinom
 
-def cutoff_fit_for_noise_model(taxon_counts_with_num_markers, noise_threshold):
-    model = fit_noise_model(taxon_counts_with_num_markers)
+def cutoff_fit_for_noise_model(taxon_counts_with_num_markers, beta_sample_size, noise_threshold, logger):
+    model = fit_noise_model(taxon_counts_with_num_markers, beta_sample_size, logger)
     ns_above_threshold = [(n,p) for (n,p) in model if p <= noise_threshold]
     if not ns_above_threshold:
         return None
@@ -17,7 +17,7 @@ def cutoff_fit_for_noise_model(taxon_counts_with_num_markers, noise_threshold):
 #
 # We address this by fitting a null model to the data, and returning the first value for which
 # the null model no longer fits.
-def fit_noise_model(taxon_counts_with_num_markers):
+def fit_noise_model(taxon_counts_with_num_markers, beta_sample_size, logger):
     total_num_markers = sum([j * taxon_counts_with_num_markers[j] for j in taxon_counts_with_num_markers])
     num_taxa = sum([taxon_counts_with_num_markers[j] for j in taxon_counts_with_num_markers])
     if num_taxa == 0: return []
@@ -25,23 +25,40 @@ def fit_noise_model(taxon_counts_with_num_markers):
     # suppose (somewhat pessimistically) that there is no information content between markers found
     # and taxa present - the data would be just as good if each marker was independently assigned to a taxon
     # uniformly at random.
-    # Then the number of markers for each taxon can be modelled as a binomially distributed random variable B_i, with:
+    # Then the number of markers for each taxon B_i can be modelled as a binomially distributed random variable, with:
     markers_for_each_taxon_n = total_num_markers
     markers_for_each_taxon_p = 1.0 / num_taxa
+    # That actually doesn't fit super well because taxa in the reference have different sizes -
+    # if we pick a subset of markers at random, some taxa will get more of them than others.
+    # Instead, model B_i as a beta binomial distributed random variable with the same mean.
+
+    # https://en.wikipedia.org/wiki/Beta_distribution#Mean_and_sample_size
+    markers_for_each_taxon_shape_a = markers_for_each_taxon_p * beta_sample_size
+    markers_for_each_taxon_shape_b = (1 - markers_for_each_taxon_p) * beta_sample_size
+
+
+
+    log = ["Distributing {} markers across {} taxa, counts of taxa with k markers are:".format(total_num_markers, num_taxa)]
+    log.append("\t".join(["k", "actual", "expected", "pmf", "p(at least actual)"]))
     results = []
     for num_markers in sorted(taxon_counts_with_num_markers.keys()):
         # there were this many taxa in the dataset
         taxon_count = taxon_counts_with_num_markers[num_markers]
         # null hypothesis: this is not yet above what we expect randomly
         # what's the chance of this happening?
+
+        num_markers_pmf = betabinom.pmf(k = num_markers, n = markers_for_each_taxon_n, a = markers_for_each_taxon_shape_a, b = markers_for_each_taxon_shape_b)
+        # num_markers_pmf = binom.pmf(k = num_markers, n = markers_for_each_taxon_n, p = markers_for_each_taxon_p)
         p = probability_at_least_taxon_count_num_markers_taxa(
-            markers_for_each_taxon_n, markers_for_each_taxon_p,
+            num_markers_pmf,
             num_taxa, num_markers, taxon_count)
+        log.append("%s\t%s\t%.2f\t%.2g\t%.2g" % (num_markers, taxon_count, round(num_markers_pmf * num_taxa, 2), num_markers_pmf, p))
         results.append((num_markers, p))
+    logger.info("\n".join(log))
     return results
 
-def probability_at_least_taxon_count_num_markers_taxa(markers_for_each_taxon_n, markers_for_each_taxon_p, num_taxa, num_markers, taxon_count):
+def probability_at_least_taxon_count_num_markers_taxa(num_markers_pmf, num_taxa, num_markers, taxon_count):
     if num_markers == 0:
         return 1
     if taxon_count == 0:
@@ -49,9 +66,6 @@ def probability_at_least_taxon_count_num_markers_taxa(markers_for_each_taxon_n,
     if num_taxa == 0 and num_markers > 0:
         return 0
 
-
-    # calculate P(B_i = num_markers)
-    num_markers_pmf = binom.pmf(k = num_markers, n = markers_for_each_taxon_n, p = markers_for_each_taxon_p)
     # define I(i, num_markers) to be 1 if B_i = num_markers and 0 otherwise
     # and let C(num_markers) be the sum of I(i, num_markers) over i
     # if I(i, num_markers) were independent, each C(num_markers) would follow a binomial distribution, with:
diff --git a/scripts/refdb_stats.py b/scripts/refdb_stats.py
new file mode 100644
index 0000000..995a3c5
--- /dev/null
+++ b/scripts/refdb_stats.py
@@ -0,0 +1,43 @@
+import argparse
+import sys
+import statistics
+
+from scipy.stats import beta
+
+def read_taxon_to_num_markers(path):
+    taxon_to_num_markers = {}
+    with open(path, 'r') as f:
+        for line in f:
+            (marker, taxon) = line.rstrip().split("\t")
+            if taxon not in taxon_to_num_markers:
+                taxon_to_num_markers[taxon] = 0
+            taxon_to_num_markers[taxon] += 1
+
+    return taxon_to_num_markers
+
+def main(argv=sys.argv[1:]):
+    parser = argparse.ArgumentParser(
+      description="refdb stats",
+      formatter_class = argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument("--input", type=str, action="store", dest="input_refdb_path", help = "Input refdb", required=True)
+
+    options = parser.parse_args(argv)
+
+    taxon_to_num_markers = read_taxon_to_num_markers(options.input_refdb_path)
+
+    print("Num taxa: {}".format(len(taxon_to_num_markers.keys())))
+    total_num_markers = sum(taxon_to_num_markers.values())
+    print("Num markers: {}".format(total_num_markers))
+    print("Mean markers: {}".format(statistics.mean(taxon_to_num_markers.values())))
+    print("Variance markers: {}".format(statistics.variance(taxon_to_num_markers.values())))
+
+    num_markers_fracs = [1.0 * x / total_num_markers for x in sorted(taxon_to_num_markers.values())]
+    (a, b, loc, scale) = beta.fit(num_markers_fracs, floc=0, fscale=1)
+    beta_mean = a / (a + b)
+    beta_mode = (a - 1) / (a + b - 2)
+    beta_sample_size = a + b
+    print("Shape parameters for a beta fit:\n a = {}\n b = {}\n mean = {}\n mode = {}\n sample size = {}".format(a, b, beta_mean, beta_mode, beta_sample_size))
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/filter_noise_model.py b/tests/filter_noise_model.py
index f0b37ff..78d6a28 100644
--- a/tests/filter_noise_model.py
+++ b/tests/filter_noise_model.py
@@ -1,8 +1,10 @@
 import unittest
 import logging
 
-from marker_alignments.filter.noise_model import fit_noise_model as t
+from marker_alignments.filter.noise_model import fit_noise_model
 
+def t(taxon_to_num_markers):
+    return fit_noise_model(taxon_to_num_markers, 10000, logging.getLogger(__name__))
 
 class NoiseModel(unittest.TestCase):
     def test_zeros_at_the_end(self):
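Two sketches follow for reviewers; neither is part of the patch. First, a minimal check of the parameterisation the patch takes from the beta distribution's mean/sample-size form: with p = 1/num_taxa, setting a = p * s and b = (1 - p) * s gives a beta binomial with the same mean as the binomial it replaces, just with more spread. All numbers below are illustrative except the 6778.2 sample size quoted in the commit message.

```
# Check that the beta binomial null model keeps the binomial's mean.
# Illustrative numbers; only beta_sample_size comes from the commit message.
from scipy.stats import binom, betabinom

total_num_markers = 100
num_taxa = 50
beta_sample_size = 6778.2

p = 1.0 / num_taxa              # chance a random marker lands on a given taxon
a = p * beta_sample_size        # beta shape parameters chosen so that
b = (1 - p) * beta_sample_size  # mean = a / (a + b) = p and a + b = sample size

print(binom.mean(n=total_num_markers, p=p))           # 2.0
print(betabinom.mean(n=total_num_markers, a=a, b=b))  # 2.0 as well

for k in range(4):
    print(k, binom.pmf(k, total_num_markers, p), betabinom.pmf(k, total_num_markers, a, b))
# The beta binomial is overdispersed, so taxa with unusually many (or few)
# markers are less surprising under it than under the plain binomial.
```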
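Second, a sketch of exercising the new entry point directly, assuming marker_alignments is importable; the taxon counts dict (number of markers per taxon mapped to how many taxa had that many) mirrors what main.py builds from its input, and its values here are invented. As the patch shows, cutoff_fit_for_noise_model returns None when no marker count clears the threshold, so the result is checked before unpacking.

```
import logging

from marker_alignments.filter.noise_model import cutoff_fit_for_noise_model

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# {number of markers: count of taxa with that many markers} - invented example data
taxon_counts_with_num_markers = {1: 17, 2: 5, 3: 2, 8: 1}

result = cutoff_fit_for_noise_model(
    taxon_counts_with_num_markers,
    6778.2,  # beta sample size, e.g. the "sample size" printed by scripts/refdb_stats.py
    0.05,    # as passed to --use-noise-model-with-confidence-threshold
    logger)
if result:
    cutoff, confidence = result
    print("Suggested --require-min-markers: {} (p = {})".format(cutoff, confidence))
```

The sample size fed in above is what scripts/refdb_stats.py prints when pointed at a tab-separated listing of the reference database with one marker and its taxon per line, which is the format read_taxon_to_num_markers parses.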