From ab36797d5ffe172ed40208c1fb715ace5069bbc8 Mon Sep 17 00:00:00 2001
From: Wojtek Bazant
Date: Thu, 26 Aug 2021 06:55:40 -0400
Subject: [PATCH] Model the noise through a beta binomial distribution

When I fit a beta distribution to the proportions of markers in taxa,
I get a sample size estimate of 6778.2 for EukDetect
---
 README.md                               |  4 ++-
 marker_alignments/filter/main.py        |  8 +++--
 marker_alignments/filter/noise_model.py | 34 +++++++++++++------
 scripts/refdb_stats.py                  | 43 +++++++++++++++++++++++++
 tests/filter_noise_model.py             |  4 ++-
 5 files changed, 79 insertions(+), 14 deletions(-)
 create mode 100644 scripts/refdb_stats.py

diff --git a/README.md b/README.md
index 1b22151..6342327 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ The default `--refdb-format` is `generic`, which tries to produce nice names, bu
 Apply a filter to alignments sumarized at the taxa level.
 
 ### Usage
-This is a demo program, which asks for two markers to be present for a taxon
+This is a demo program, which asks for two markers to be present for a taxon.
 
 ```
 summarize_marker_alignments --input tests/data/example.sam --output /dev/stdout --output-type taxon_read_and_marker_count \
@@ -75,6 +75,8 @@ summarize_marker_alignments --input tests/data/example.sam --output /dev/stdout
   --require-min-markers 2
 ```
 
+There's an option to model the noise with a beta binomial distribution and pick a good `--require-min-markers` value, but it's still experimental.
+
 ## Known issues
 
 Quantitative information obtained by aligning to a EukDetect reference database might be a little bit dodgy since there are typically very few reads.
diff --git a/marker_alignments/filter/main.py b/marker_alignments/filter/main.py
index 733dfd3..014e64a 100644
--- a/marker_alignments/filter/main.py
+++ b/marker_alignments/filter/main.py
@@ -17,6 +17,7 @@ def main(argv=sys.argv[1:]):
     parser.add_argument("--require-min-markers", type=int, action="store", dest="require_min_markers", help = "Require min markers to keep a taxon")
     parser.add_argument("--use-noise-model-with-confidence-threshold", type=float, action="store", dest="noise_threshold", help = "Use a null model where markers associate with taxa at random with the provided confidence threshold, and infer an appropriate value for --require-min-markers")
     parser.add_argument("--total-num-taxa", type=int, action="store", dest="total_num_taxa", help = "Total number of taxa in the reference - required for fitting the noise model")
+    parser.add_argument("--taxon-to-markers-beta-sample-size", type=float, action="store", dest="beta_sample_size", help = "Sample size (sum of shape parameters a and b when proportion of markers per taxon is modelled as a beta distribution) - required for fitting the noise model")
     parser.add_argument("--output", type=str, action="store", dest="output_path", help = "output path", required=True)
     parser.add_argument("--verbose", action="store_true", dest="verbose", help = "turn on logging")
 
@@ -27,6 +28,9 @@ def main(argv=sys.argv[1:]):
     if options.noise_threshold and not options.total_num_taxa:
         raise ValueError("--total-num-taxa required if --use-noise-model-with-confidence-threshold is provided")
 
+    if options.noise_threshold and not options.beta_sample_size:
+        raise ValueError("--taxon-to-markers-beta-sample-size required if --use-noise-model-with-confidence-threshold is provided")
+
     lines, fieldnames = [], {}
     with open(options.input_path) as f:
         reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
@@ -46,9 +50,9 @@ def main(argv=sys.argv[1:]):
             if n not in taxon_counts_with_num_markers:
                 taxon_counts_with_num_markers[n] = 0
             taxon_counts_with_num_markers[n] += 1
-        cutoff, confidence = cutoff_fit_for_noise_model(taxon_counts_with_num_markers, options.noise_threshold)
+        cutoff, confidence = cutoff_fit_for_noise_model(taxon_counts_with_num_markers, options.beta_sample_size, options.noise_threshold, logger)
         num_markers = sum([j * taxon_counts_with_num_markers[j] for j in taxon_counts_with_num_markers])
-        logger.info("Built a null model of %s markers each independently assigned uniformly at random to one of all %s taxa", num_markers, options.total_num_taxa)
+        logger.info("Built a null model of %s markers assigned to one of all %s taxa through a beta binomial distribution", num_markers, options.total_num_taxa)
         logger.info("In the data, taxa the fewest number of markers probable under the null model with less than --use-noise-model-with-confidence-threshold value %s is %s ", options.noise_threshold, cutoff)
         logger.info("The likelihood of %s taxa with %s markers occuring under the null model was calculated to be %s", taxon_counts_with_num_markers[cutoff], cutoff, confidence )
         if options.require_min_markers and options.require_min_markers > cutoff:
diff --git a/marker_alignments/filter/noise_model.py b/marker_alignments/filter/noise_model.py
index 2e03d39..e17fc9c 100644
--- a/marker_alignments/filter/noise_model.py
+++ b/marker_alignments/filter/noise_model.py
@@ -1,7 +1,7 @@
-from scipy.stats import binom
+from scipy.stats import binom, betabinom
 
-def cutoff_fit_for_noise_model(taxon_counts_with_num_markers, noise_threshold):
-    model = fit_noise_model(taxon_counts_with_num_markers)
+def cutoff_fit_for_noise_model(taxon_counts_with_num_markers, beta_sample_size, noise_threshold, logger):
+    model = fit_noise_model(taxon_counts_with_num_markers, beta_sample_size, logger)
     ns_above_threshold = [(n,p) for (n,p) in model if p <= noise_threshold]
     if not ns_above_threshold:
         return None
@@ -17,7 +17,7 @@ def cutoff_fit_for_noise_model(taxon_counts_with_num_markers, noise_threshold):
 #
 # We address this by fitting a null model to the data, and returning the first value for which
 # the null model no longer fits.
-def fit_noise_model(taxon_counts_with_num_markers):
+def fit_noise_model(taxon_counts_with_num_markers, beta_sample_size, logger):
     total_num_markers = sum([j * taxon_counts_with_num_markers[j] for j in taxon_counts_with_num_markers])
     num_taxa = sum([taxon_counts_with_num_markers[j] for j in taxon_counts_with_num_markers])
     if num_taxa == 0: return []
@@ -25,23 +25,40 @@ def fit_noise_model(taxon_counts_with_num_markers):
     # suppose (somewhat pessimistically) that there is no information content between markers found
     # and taxa present - the data would be just as good if each marker was independently assigned to a taxon
     # uniformly at random.
-    # Then the number of markers for each taxon can be modelled as a binomially distributed random variable B_i, with:
+    # Then the number of markers for each taxon B_i can be modelled as a binomially distributed random variable, with:
     markers_for_each_taxon_n = total_num_markers
     markers_for_each_taxon_p = 1.0 / num_taxa
+    # That actually doesn't fit super well because taxa in the reference have different sizes -
+    # if we pick a subset of markers at random, some taxa will get more of them than others.
+    # Instead, model B_i as a beta binomial distributed random variable with the same mean.
+
+    # https://en.wikipedia.org/wiki/Beta_distribution#Mean_and_sample_size
+    markers_for_each_taxon_shape_a = markers_for_each_taxon_p * beta_sample_size
+    markers_for_each_taxon_shape_b = (1 - markers_for_each_taxon_p) * beta_sample_size
+
+
+
+    log = ["Distributing {} markers across {} taxa, counts of taxa with k markers are:".format(total_num_markers, num_taxa)]
+    log.append("\t".join(["k", "actual", "expected", "pmf", "p(at least actual)"]))
     results = []
     for num_markers in sorted(taxon_counts_with_num_markers.keys()):
         # there were this many taxa in the dataset
         taxon_count = taxon_counts_with_num_markers[num_markers]
         # null hypothesis: this is not yet above what we expect randomly
         # what's the chance of this happening?
+
+        num_markers_pmf = betabinom.pmf(k = num_markers, n = markers_for_each_taxon_n, a = markers_for_each_taxon_shape_a, b = markers_for_each_taxon_shape_b)
+        # num_markers_pmf = binom.pmf(k = num_markers, n = markers_for_each_taxon_n, p = markers_for_each_taxon_p)
         p = probability_at_least_taxon_count_num_markers_taxa(
-            markers_for_each_taxon_n, markers_for_each_taxon_p,
+            num_markers_pmf,
             num_taxa, num_markers, taxon_count)
+        log.append("%s\t%s\t%.2f\t%.2g\t%.2g" % (num_markers, taxon_count, round(num_markers_pmf * num_taxa, 2), num_markers_pmf, p))
         results.append((num_markers, p))
+    logger.info("\n".join(log))
     return results
 
-def probability_at_least_taxon_count_num_markers_taxa(markers_for_each_taxon_n, markers_for_each_taxon_p, num_taxa, num_markers, taxon_count):
+def probability_at_least_taxon_count_num_markers_taxa(num_markers_pmf, num_taxa, num_markers, taxon_count):
     if num_markers == 0:
         return 1
     if taxon_count == 0:
@@ -49,9 +66,6 @@ def probability_at_least_taxon_count_num_markers_taxa(markers_for_each_taxon_n,
     if num_taxa == 0 and num_markers > 0:
         return 0
 
-
-    # calculate P(B_i = num_markers)
-    num_markers_pmf = binom.pmf(k = num_markers, n = markers_for_each_taxon_n, p = markers_for_each_taxon_p)
     # define I(i, num_markers) to be 1 if B_i = num_markers and 0 otherwise
     # and let C(num_markers) be the sum of I(i, num_markers) over i
     # if I(i, num_markers) were independent, each C(num_markers) would follow a binomial distribution, with:
diff --git a/scripts/refdb_stats.py b/scripts/refdb_stats.py
new file mode 100644
index 0000000..995a3c5
--- /dev/null
+++ b/scripts/refdb_stats.py
@@ -0,0 +1,43 @@
+import argparse
+import sys
+import statistics
+
+from scipy.stats import beta
+
+def read_taxon_to_num_markers(path):
+    taxon_to_num_markers = {}
+    with open(path, 'r') as f:
+        for line in f:
+            (marker, taxon) = line.rstrip().split("\t")
+            if taxon not in taxon_to_num_markers:
+                taxon_to_num_markers[taxon] = 0
+            taxon_to_num_markers[taxon] += 1
+
+    return taxon_to_num_markers
+
+def main(argv=sys.argv[1:]):
+    parser = argparse.ArgumentParser(
+      description="refdb stats",
+      formatter_class = argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument("--input", type=str, action="store", dest="input_refdb_path", help = "Input refdb", required=True)
+
+    options = parser.parse_args(argv)
+
+    taxon_to_num_markers = read_taxon_to_num_markers(options.input_refdb_path)
+
+    print("Num taxa: {}".format(len(taxon_to_num_markers.keys())))
+    total_num_markers = sum(taxon_to_num_markers.values())
+    print("Num markers: {}".format(total_num_markers))
+    print("Mean markers: {}".format(statistics.mean(taxon_to_num_markers.values())))
+    print("Variance markers: {}".format(statistics.variance(taxon_to_num_markers.values())))
+
+    num_markers_fracs = [1.0 * x / total_num_markers for x in sorted(taxon_to_num_markers.values())]
+    (a, b, loc, scale) = beta.fit(num_markers_fracs, floc=0, fscale=1)
+    beta_mean = a / (a + b)
+    beta_mode = (a - 1) / (a + b - 2)
+    beta_sample_size = a + b
+    print("Shape parameters for a beta fit:\n a = {}\n b = {}\n mean = {}\n mode = {}\n sample size = {}".format(a, b, beta_mean, beta_mode, beta_sample_size))
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/filter_noise_model.py b/tests/filter_noise_model.py
index f0b37ff..78d6a28 100644
--- a/tests/filter_noise_model.py
+++ b/tests/filter_noise_model.py
@@ -1,8 +1,10 @@
 import unittest
 import logging
 
-from marker_alignments.filter.noise_model import fit_noise_model as t
+from marker_alignments.filter.noise_model import fit_noise_model
 
+def t(taxon_to_num_markers):
+    return fit_noise_model(taxon_to_num_markers, 10000, logging.getLogger(__name__))
 
 class NoiseModel(unittest.TestCase):
     def test_zeros_at_the_end(self):
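Two sketches follow for reviewers; neither is part of the patch. First, a minimal check of the parameterisation the patch takes from the beta distribution's mean/sample-size form: with p = 1/num_taxa, setting a = p * s and b = (1 - p) * s gives a beta binomial with the same mean as the binomial it replaces, just with more spread. All numbers below are illustrative except the 6778.2 sample size quoted in the commit message.

```
# Check that the beta binomial null model keeps the binomial's mean.
# Illustrative numbers; only beta_sample_size comes from the commit message.
from scipy.stats import binom, betabinom

total_num_markers = 100
num_taxa = 50
beta_sample_size = 6778.2

p = 1.0 / num_taxa              # chance a random marker lands on a given taxon
a = p * beta_sample_size        # beta shape parameters chosen so that
b = (1 - p) * beta_sample_size  # mean = a / (a + b) = p and a + b = sample size

print(binom.mean(n=total_num_markers, p=p))           # 2.0
print(betabinom.mean(n=total_num_markers, a=a, b=b))  # 2.0 as well

for k in range(4):
    print(k, binom.pmf(k, total_num_markers, p), betabinom.pmf(k, total_num_markers, a, b))
# The beta binomial is overdispersed, so taxa with unusually many (or few)
# markers are less surprising under it than under the plain binomial.
```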
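Second, a sketch of exercising the new entry point directly, assuming marker_alignments is importable; the taxon counts dict (number of markers per taxon mapped to how many taxa had that many) mirrors what main.py builds from its input, and its values here are invented. As the patch shows, cutoff_fit_for_noise_model returns None when no marker count clears the threshold, so the result is checked before unpacking.

```
import logging

from marker_alignments.filter.noise_model import cutoff_fit_for_noise_model

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# {number of markers: count of taxa with that many markers} - invented example data
taxon_counts_with_num_markers = {1: 17, 2: 5, 3: 2, 8: 1}

result = cutoff_fit_for_noise_model(
    taxon_counts_with_num_markers,
    6778.2,  # beta sample size, e.g. the "sample size" printed by scripts/refdb_stats.py
    0.05,    # as passed to --use-noise-model-with-confidence-threshold
    logger)
if result:
    cutoff, confidence = result
    print("Suggested --require-min-markers: {} (p = {})".format(cutoff, confidence))
```

The sample size fed in above is what scripts/refdb_stats.py prints when pointed at a tab-separated listing of the reference database with one marker and its taxon per line, which is the format read_taxon_to_num_markers parses.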