diff --git a/CHANGELOG.md b/CHANGELOG.md index 9be3c0b7..2c998057 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +v1.0.2 +------- +Minor fixes and models release. + +* R9.4.1 variant calling models for Guppy 3.6.0 and updated benchmarks. +* Made r941_min_high_g360 the default consensus model. +* VCF GQ is now an integer in line with VCF spec. +* Fixed issue requiring a previous model for training. +* Fixed issue causing -p option of medaka_variant to crash. +* Fixed issue preventing installation in a virtualenv with python <3.6. + + v1.0.1 ------- Minor fixes release, resolving issues introduced in v1.0.0. diff --git a/MANIFEST.in b/MANIFEST.in index 11c99aa6..9c16dd4f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,7 +5,7 @@ include LICENSE.md include requirements.txt include build.py include Makefile -graft submodules/samtools-1.3.1/htslib-1.3.1 +graft submodules/samtools-1.9/htslib-1.9 graft images prune build prune docs diff --git a/docs/snp.rst b/docs/snp.rst index 556798c8..3195fee2 100644 --- a/docs/snp.rst +++ b/docs/snp.rst @@ -127,6 +127,31 @@ the HG002 (NA24385) sample at 60-fold coverage. | | Indel | 0.9508 | 0.9385 | 0.945 | +------------------+-------+-----------+---------+----------+ + +Guppy 3.6 SNP and Indel calling +------------------------------- + +*Last updated June 2020* + +The benchmarks were performed as described above, but on chromosome 20 of +the HG002 (NA24385) sample at 60-fold coverage. + +.. table:: + R9.4.1 small variant calling at 60-fold coverage using `medaka variant` pipeline. + Comparison is made with the `GIAB `_ + high confidence callset using `hap.py `_. + + +-------------------------+-------+-----------+---------+----------+ + | medaka / variant model | Class | Precision | Recall | F1 score | + +-------------------------+-------+-----------+---------+----------+ + | v1.0.1 / | SNP | 0.9898 | 0.9931 | 0.992 | + + r941_prom_variant_g322 +-------+-----------+---------+----------+ + | | Indel | 0.9199 | 0.8634 | 0.891 | + +-------------------------+-------+-----------+---------+----------+ + | v1.0.2 / | SNP | 0.9917 | 0.9965 | 0.994 | + + r941_prom_variant_g360 +-------+-----------+---------+----------+ + | | Indel | 0.9379 | 0.8982 | 0.918 | + +-------------------------+-------+-----------+---------+----------+ Performing Variant Calling diff --git a/medaka/__init__.py b/medaka/__init__.py index 76dff04b..faa6ee66 100644 --- a/medaka/__init__.py +++ b/medaka/__init__.py @@ -4,7 +4,7 @@ import os import subprocess -__version__ = '1.0.1' +__version__ = '1.0.2' def check_minimap2_version(): diff --git a/medaka/common.py b/medaka/common.py index 8093b600..92e719ab 100644 --- a/medaka/common.py +++ b/medaka/common.py @@ -539,7 +539,7 @@ def get_regions(bam, region_strs=None): if region_strs is not None: if os.path.isfile(region_strs[0]): with open(region_strs[0]) as fh: - region_strs = [l.strip() for l in fh.readlines()] + region_strs = [line.strip() for line in fh.readlines()] regions = [] for r in (Region.from_string(x) for x in region_strs): diff --git a/medaka/data/r941_prom_snp_g360_model.hdf5 b/medaka/data/r941_prom_snp_g360_model.hdf5 new file mode 100755 index 00000000..2779d2e9 --- /dev/null +++ b/medaka/data/r941_prom_snp_g360_model.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40dec35aa25e70eef2066ccb5e938aa92008d32fc247fc03ef2f3c44fc11133e +size 3320052 diff --git a/medaka/data/r941_prom_variant_g360_model.hdf5 b/medaka/data/r941_prom_variant_g360_model.hdf5 new file mode 100755 index 00000000..4e7c8ffb --- /dev/null +++ b/medaka/data/r941_prom_variant_g360_model.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e61e7c7223e87426df7ae1aad7ccc9cd10bc6cab3ae27f0892884d39fd7976c +size 3299572 diff --git a/medaka/labels.py b/medaka/labels.py index a1c8705c..3831971c 100644 --- a/medaka/labels.py +++ b/medaka/labels.py @@ -382,12 +382,12 @@ def _phred(err, cap=70.0): return np.minimum(q, cap) @staticmethod - def _pfmt(p): - """Cast float to string with 3 decimal places. + def _pfmt(p, dp=3): + """Cast float to string rounding to dp decimal places. Used to format probabilities and quality scores for vcf output. """ - return '{:.3f}'.format(p) + return '{:.{dp}f}'.format(round(p, dp), dp=dp) @abc.abstractmethod def _alignment_to_pairs(self, aln): @@ -652,7 +652,7 @@ def snp_metainfo(self): """Return meta data for use in `.vcf` header.""" MI = medaka.vcf.MetaInfo m = [MI('FORMAT', 'GT', 1, 'String', 'Medaka genotype'), - MI('FORMAT', 'GQ', 1, 'Float', 'Medaka genotype quality score')] + MI('FORMAT', 'GQ', 1, 'Integer', 'Medaka genotype quality score')] if self.verbose: m.extend([MI('INFO', 'ref_prob', 1, 'Float', 'Medaka probability for reference allele'), @@ -783,11 +783,11 @@ def _prob_to_snp( not secondary_exceeds_threshold)): alt = primary_call - qual = self._pfmt(self._phred(1 - primary_prob)) - genotype = {'GT': '1/1', 'GQ': qual} + qual = self._phred(1 - primary_prob) + genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)} results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt, filt='PASS', info=info, - qual=qual, genotype_data=genotype)) + qual=self._pfmt(qual), genotype_data=genotype)) # heterozygous, no deletions elif all((not primary_is_deletion, @@ -795,16 +795,16 @@ def _prob_to_snp( secondary_exceeds_threshold)): err = 1 - (primary_prob + secondary_prob) - qual = self._pfmt(self._phred(err)) + qual = self._phred(err) # filtering by list comp maintains order alt = [c for c in [primary_call, secondary_call] if not c == ref_symbol] gt = '0/1' if len(alt) == 1 else '1/2' - genotype = {'GT': gt, 'GQ': qual} + genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)} results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt, filt='PASS', - info=info, qual=qual, genotype_data=genotype)) + info=info, qual=self._pfmt(qual), genotype_data=genotype)) # heterozygous, secondary deletion elif all((not primary_is_reference, @@ -813,21 +813,22 @@ def _prob_to_snp( secondary_exceeds_threshold)): alt = primary_call - qual = self._pfmt(self._phred(1 - primary_prob)) - genotype = {'GT': '1/1', 'GQ': qual} + qual = self._phred(1 - primary_prob) + genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)} results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt, filt='PASS', - info=info, qual=qual, genotype_data=genotype)) + info=info, qual=self._pfmt(qual), genotype_data=genotype)) # no snp at this location else: if return_all: # return variant even though it is not a snp - qual = self._pfmt(self._phred(1 - primary_prob)) - genotype = {'GT': '0/0', 'GQ': qual} + qual = self._phred(1 - primary_prob) + genotype = {'GT': '0/0', 'GQ': self._pfmt(qual, 0)} results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt='.', filt='PASS', - info=info, qual=qual, genotype_data=genotype)) + info=info, qual=self._pfmt(qual), + genotype_data=genotype)) return results def decode_variants(self, sample, ref_seq): @@ -948,8 +949,8 @@ def get_symbol(p): info = {} # log likelihood ratio - qual = self._pfmt(sum(pred_quals) - sum(ref_quals)) - genotype = {'GT': '1', 'GQ': qual} + qual = sum(pred_quals) - sum(ref_quals) + genotype = {'GT': '1', 'GQ': self._pfmt(qual, 0)} var_pos = pos['major'][start] @@ -960,7 +961,8 @@ def get_symbol(p): variant = medaka.vcf.Variant(sample.ref_name, var_pos, var_ref, alt=var_pred, filt='PASS', info=info, - qual=qual, genotype_data=genotype) + qual=self._pfmt(qual), + genotype_data=genotype) variant = variant.trim() variants.append(variant) @@ -973,7 +975,7 @@ def variant_metainfo(self): m = [MI('FORMAT', 'GT', 1, 'String', 'Medaka genotype.'), - MI('FORMAT', 'GQ', 1, 'Float', + MI('FORMAT', 'GQ', 1, 'Integer', 'Medaka genotype quality score'), ] if self.verbose: m.extend([MI('INFO', 'ref_seq', 1, 'String', @@ -1110,7 +1112,7 @@ def _make_info(rs, p, c): if return_all: # return variant even though it is not a snp q = self._pfmt(qual) - genotype = {'GT': '0/0', 'GQ': q} + genotype = {'GT': '0/0', 'GQ': self._pfmt(qual, 0)} info = _make_info(ref_symbol, prob, call) results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt='.', filt='PASS', @@ -1122,7 +1124,7 @@ def _make_info(rs, p, c): alt = [s for s in call if s != ref_symbol] gt = '0/1' if len(alt) == 1 else '1/2' q = self._pfmt(qual) - genotype = {'GT': gt, 'GQ': q} + genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)} info = _make_info(ref_symbol, prob, call) results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt, filt='PASS', @@ -1136,7 +1138,7 @@ def _make_info(rs, p, c): alt = [s for s in call if s != '*'] gt = '1/1' q = self._pfmt(qual) - genotype = {'GT': gt, 'GQ': q} + genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)} info = _make_info(ref_symbol, prob, call) results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt, filt='PASS', @@ -1144,7 +1146,7 @@ def _make_info(rs, p, c): elif not contains_deletion: # homozygous (alt, alt) alt = call[0] q = self._pfmt(qual) - genotype = {'GT': '1/1', 'GQ': q} + genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)} info = _make_info(ref_symbol, prob, call) results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt, filt='PASS', @@ -1231,8 +1233,8 @@ def padding_vector(self): gap = [('*', '*')] return self._labels_to_encoded_labels(gap)[0] - def _is_het(self, l): - return 1 if not self._singleton(l) else 0 + def _is_het(self, x): + return 1 if not self._singleton(x) else 0 def encoded_labels_to_training_vectors(self, enc_labels): """Convert integer (tuple) encoded labels to truth vectors. @@ -1313,50 +1315,51 @@ def _prob_to_snp( not is_het, not contains_deletion)): - qual = self._pfmt(self._phred(1 - primary_prob)) + qual = self._phred(1 - primary_prob) alt = call[0] - genotype = {'GT': '1/1', 'GQ': qual} + genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)} results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt, filt='PASS', - info=info, qual=qual, genotype_data=genotype)) + info=info, qual=self._pfmt(qual), genotype_data=genotype)) # heterozygous, no deletions elif all((is_het, not contains_deletion)): err = 1 - 0.5 * (primary_prob + secondary_prob) - qual = self._pfmt(self._phred(err)) + qual = self._phred(err) alt = [s for s in call if s != ref_symbol] gt = '0/1' if len(alt) == 1 else '1/2' - genotype = {'GT': gt, 'GQ': qual} + genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)} results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt, filt='PASS', - info=info, qual=qual, genotype_data=genotype)) + info=info, qual=self._pfmt(qual), genotype_data=genotype)) # heterozygous, one deletion elif all((is_het, contains_nonref, contains_deletion)): - qual = self._pfmt(self._phred(1 - primary_prob)) + qual = self._phred(1 - primary_prob) alt = [s for s in call if s != '*'] gt = '1/1' - genotype = {'GT': gt, 'GQ': qual} + genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)} results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt, filt='PASS', - info=info, qual=qual, genotype_data=genotype)) + info=info, qual=self._pfmt(qual), genotype_data=genotype)) # no snp at this location else: if return_all: - qual = self._pfmt(ref_prob) + qual = self._phred(ref_prob) # return variant even though it is not a snp - genotype = {'GT': '0/0', 'GQ': qual} + genotype = {'GT': '0/0', 'GQ': self._pfmt(qual, 0)} results.append(medaka.vcf.Variant( ref_name, pos, ref_symbol, alt='.', filt='PASS', - info=info, qual=qual, genotype_data=genotype)) + info=info, qual=self._pfmt(qual), + genotype_data=genotype)) return results diff --git a/medaka/medaka.py b/medaka/medaka.py index 1c92f15b..19798d5d 100644 --- a/medaka/medaka.py +++ b/medaka/medaka.py @@ -638,6 +638,6 @@ def main(): msg = "Reads will be filtered to only those with RG tag: {}" logger.info(msg.format(RG)) # if model is default, resolve to file, save mess in help text - if hasattr(args, 'model'): + if hasattr(args, 'model') and args.model is not None: args.model = medaka.models.resolve_model(args.model) args.func(args) diff --git a/medaka/options.py b/medaka/options.py index 7b27effc..785487cb 100644 --- a/medaka/options.py +++ b/medaka/options.py @@ -26,7 +26,7 @@ # r10 consensus 'r103_min_high_g345', 'r103_min_high_g360', 'r103_prom_high_g360', # snp and variant - 'r941_prom_snp_g322', 'r941_prom_variant_g322', + 'r941_prom_snp_g360', 'r941_prom_variant_g360', 'r103_prom_snp_g3210', 'r103_prom_variant_g3210'] archived_models = [ # r9 consensus @@ -35,12 +35,13 @@ # r10 consensus 'r10_min_high_g303', 'r10_min_high_g340', # snp and variant - 'r941_prom_snp_g303', 'r941_prom_variant_g303'] + 'r941_prom_snp_g303', 'r941_prom_variant_g303', + 'r941_prom_snp_g322', 'r941_prom_variant_g322'] allowed_models = sorted(current_models + archived_models) default_models = { - 'consensus': 'r941_min_high_g351', - 'snp': 'r941_prom_snp_g322', - 'variant': 'r941_prom_variant_g322'} + 'consensus': 'r941_min_high_g360', + 'snp': 'r941_prom_snp_g360', + 'variant': 'r941_prom_variant_g360'} alignment_params = { 'rle': "-M 5 -S 4 -O 2 -E 3", diff --git a/medaka/test/test_labels.py b/medaka/test/test_labels.py index 1744b3b2..7dfd8d10 100644 --- a/medaka/test/test_labels.py +++ b/medaka/test/test_labels.py @@ -337,7 +337,7 @@ def test_decode_variants(self): self.assertEqual(v.alt, [alt], msg=msg) self.assertEqual(v.genotype_data['GT'], '1', msg=msg) self.assertAlmostEqual(float(v.qual), expected_qual, places=3, msg=msg) - self.assertAlmostEqual(float(v.genotype_data['GQ']), expected_qual, places=3, msg=msg) + self.assertEqual(int(v.genotype_data['GQ']), round(expected_qual), msg=msg) def test_decode_snps(self): @@ -411,7 +411,7 @@ def test_decode_snps(self): self.assertEqual(v.alt, alt) self.assertEqual(v.genotype_data['GT'], gt) gq = qual_homo if len(set(v.gt)) == 1 else qual_hetero - self.assertAlmostEqual(float(v.genotype_data['GQ']), gq, places=3) + self.assertEqual(int(v.genotype_data['GQ']), round(gq)) def test_padding_vector(self): self.assertEqual(self.ls.padding_vector, 0) @@ -741,7 +741,7 @@ def test_decode_snps_is_het_always_true(self): self.assertEqual(v.alt, alt) self.assertEqual(v.genotype_data['GT'], gt) gq = qual_hom if len(set(v.gt)) == 1 else qual_het - self.assertAlmostEqual(float(v.genotype_data['GQ']), gq, places=3) + self.assertEqual(int(v.genotype_data['GQ']), round(gq)) def test_decode_snps_is_het_always_off(self): @@ -789,7 +789,7 @@ def test_decode_snps_is_het_always_off(self): self.assertEqual(v.alt, alt) self.assertEqual(v.genotype_data['GT'], gt) gq = qual_hom if len(set(v.gt)) == 1 else qual_het - self.assertAlmostEqual(float(v.genotype_data['GQ']), gq, places=3) + self.assertEqual(int(v.genotype_data['GQ']), round(gq)) def test_padding_vector(self): self.assertEqual(self.ls.padding_vector, 0) diff --git a/medaka/vcf.py b/medaka/vcf.py index 710f9722..1ffe0e60 100644 --- a/medaka/vcf.py +++ b/medaka/vcf.py @@ -699,7 +699,7 @@ def _merge_variants( gts = [1, 0] gt = gt_sep.join(map(str, gts)) - genotype_data = {'GT': gt, 'GQ': qual} + genotype_data = {'GT': gt, 'GQ': round(qual)} return Variant( v.chrom, interval.begin, ref, alt=alts, @@ -799,9 +799,13 @@ def meta_info(self): # 'Number' value should be ā€˜Gā€™. m.append(MetaInfo( 'FORMAT', 'GT', 'G', 'String', 'Genotype')) - # if this is not a float, vcf benchmarking tools may fail + # VCF spec states that GQ should be an int, and whatshap >0.18 will + # fail to parse the vcf if it's a float. + # vcf benchmarking tools which require a float could use the QUAL + # as we write out the same number for QUAL and GQ, the QG is just + # rounded. m.append(MetaInfo( - 'FORMAT', 'GQ', 'G', 'Float', 'Genotype quality score')) + 'FORMAT', 'GQ', 'G', 'Integer', 'Genotype quality score')) return m diff --git a/requirements.txt b/requirements.txt index 9b877827..f0cfea63 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # medaka requirements. # Add comments to keep track of why we are using particular versions -biopython +biopython>=1.73,<1.77 # 1.76 is last version of this whatshap dependency to support python < 3.6 cffi grpcio==1.27.2 # last version of this tensorflow dependency to have a manylinux1 wheel. h5py==2.7.1 # 2.8.0 and 2.9.0 seem to not like keras model writing diff --git a/scripts/medaka_variant b/scripts/medaka_variant index 57954b86..b8300312 100755 --- a/scripts/medaka_variant +++ b/scripts/medaka_variant @@ -32,8 +32,8 @@ PHASEOUT=false DELETE=false SNPMODEL=${modeldata[2]##* } MODEL=${modeldata[3]##* } -INDEL_THRESHOLD=14 -SNP_THRESHOLD=12 +INDEL_THRESHOLD=9 +SNP_THRESHOLD=8 FILTER_FLAG=true STOPAFTERROUND0=false @@ -45,7 +45,7 @@ Variant calling via neural networks. The input bam should be aligned to the reference against which to call variants. Note: that although configurable it is unlikely that the model arguments should be changed from their defaults. -$(basename "$0") [-h] -i +$(basename "$0") [-h] -i -h show this help text. -i input bam of reads aligned to ref. Read groups are currently ignored, @@ -397,6 +397,8 @@ else # Filter resulting vcf FILTER="(((TYPE=\"snp\" | TYPE=\"mnp\") & (QUAL > ${SNP_THRESHOLD})) | ((TYPE!=\"snp\" & TYPE!=\"mnp\") & (QUAL > ${INDEL_THRESHOLD})))" bcftools filter --threads $THREADS -s lowqual -i "${FILTER}" ${UNFILTERED} > ${FINAL} + # pyvcf used in whatshap<=0.18 can't parse the ##FILTER=