Skip to content

Commit

Permalink
Merge branch 'dev' into 'master'
Browse files Browse the repository at this point in the history
merge dev into master pre-release

See merge request research/medaka!416
  • Loading branch information
cjw85 committed Jun 5, 2020
2 parents e345f3b + cc7e6f8 commit b5fc63f
Show file tree
Hide file tree
Showing 14 changed files with 111 additions and 58 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
v1.0.2
-------
Minor fixes and models release.

* R9.4.1 variant calling models for Guppy 3.6.0 and updated benchmarks.
* Made r941_min_high_g360 the default consensus model.
* VCF GQ is now an integer in line with VCF spec.
* Fixed issue requiring a previous model for training.
* Fixed issue causing -p option of medaka_variant to crash.
* Fixed issue preventing installation in a virtualenv with python <3.6.


v1.0.1
-------
Minor fixes release, resolving issues introduced in v1.0.0.
Expand Down
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include LICENSE.md
include requirements.txt
include build.py
include Makefile
graft submodules/samtools-1.3.1/htslib-1.3.1
graft submodules/samtools-1.9/htslib-1.9
graft images
prune build
prune docs
25 changes: 25 additions & 0 deletions docs/snp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,31 @@ the HG002 (NA24385) sample at 60-fold coverage.
| | Indel | 0.9508 | 0.9385 | 0.945 |
+------------------+-------+-----------+---------+----------+


Guppy 3.6 SNP and Indel calling
-------------------------------

*Last updated June 2020*

The benchmarks were performed as described above, but on chromosome 20 of
the HG002 (NA24385) sample at 60-fold coverage.

.. table::
R9.4.1 small variant calling at 60-fold coverage using `medaka variant` pipeline.
Comparison is made with the `GIAB <http://jimb.stanford.edu/giab-resources/>`_
high confidence callset using `hap.py <https://github.com/Illumina/hap.py>`_.

+-------------------------+-------+-----------+---------+----------+
| medaka / variant model | Class | Precision | Recall | F1 score |
+-------------------------+-------+-----------+---------+----------+
| v1.0.1 / | SNP | 0.9898 | 0.9931 | 0.992 |
+ r941_prom_variant_g322 +-------+-----------+---------+----------+
| | Indel | 0.9199 | 0.8634 | 0.891 |
+-------------------------+-------+-----------+---------+----------+
| v1.0.2 / | SNP | 0.9917 | 0.9965 | 0.994 |
+ r941_prom_variant_g360 +-------+-----------+---------+----------+
| | Indel | 0.9379 | 0.8982 | 0.918 |
+-------------------------+-------+-----------+---------+----------+


Performing Variant Calling
Expand Down
2 changes: 1 addition & 1 deletion medaka/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import subprocess

__version__ = '1.0.1'
__version__ = '1.0.2'


def check_minimap2_version():
Expand Down
2 changes: 1 addition & 1 deletion medaka/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,7 @@ def get_regions(bam, region_strs=None):
if region_strs is not None:
if os.path.isfile(region_strs[0]):
with open(region_strs[0]) as fh:
region_strs = [l.strip() for l in fh.readlines()]
region_strs = [line.strip() for line in fh.readlines()]

regions = []
for r in (Region.from_string(x) for x in region_strs):
Expand Down
3 changes: 3 additions & 0 deletions medaka/data/r941_prom_snp_g360_model.hdf5
Git LFS file not shown
3 changes: 3 additions & 0 deletions medaka/data/r941_prom_variant_g360_model.hdf5
Git LFS file not shown
79 changes: 41 additions & 38 deletions medaka/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,12 +382,12 @@ def _phred(err, cap=70.0):
return np.minimum(q, cap)

@staticmethod
def _pfmt(p):
"""Cast float to string with 3 decimal places.
def _pfmt(p, dp=3):
"""Cast float to string rounding to dp decimal places.
Used to format probabilities and quality scores for vcf output.
"""
return '{:.3f}'.format(p)
return '{:.{dp}f}'.format(round(p, dp), dp=dp)

@abc.abstractmethod
def _alignment_to_pairs(self, aln):
Expand Down Expand Up @@ -652,7 +652,7 @@ def snp_metainfo(self):
"""Return meta data for use in `.vcf` header."""
MI = medaka.vcf.MetaInfo
m = [MI('FORMAT', 'GT', 1, 'String', 'Medaka genotype'),
MI('FORMAT', 'GQ', 1, 'Float', 'Medaka genotype quality score')]
MI('FORMAT', 'GQ', 1, 'Integer', 'Medaka genotype quality score')]
if self.verbose:
m.extend([MI('INFO', 'ref_prob', 1, 'Float',
'Medaka probability for reference allele'),
Expand Down Expand Up @@ -783,28 +783,28 @@ def _prob_to_snp(
not secondary_exceeds_threshold)):

alt = primary_call
qual = self._pfmt(self._phred(1 - primary_prob))
genotype = {'GT': '1/1', 'GQ': qual}
qual = self._phred(1 - primary_prob)
genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS', info=info,
qual=qual, genotype_data=genotype))
qual=self._pfmt(qual), genotype_data=genotype))

# heterozygous, no deletions
elif all((not primary_is_deletion,
not secondary_is_deletion,
secondary_exceeds_threshold)):

err = 1 - (primary_prob + secondary_prob)
qual = self._pfmt(self._phred(err))
qual = self._phred(err)
# filtering by list comp maintains order
alt = [c for c in [primary_call, secondary_call]
if not c == ref_symbol]
gt = '0/1' if len(alt) == 1 else '1/2'
genotype = {'GT': gt, 'GQ': qual}
genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)}

results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
info=info, qual=qual, genotype_data=genotype))
info=info, qual=self._pfmt(qual), genotype_data=genotype))

# heterozygous, secondary deletion
elif all((not primary_is_reference,
Expand All @@ -813,21 +813,22 @@ def _prob_to_snp(
secondary_exceeds_threshold)):

alt = primary_call
qual = self._pfmt(self._phred(1 - primary_prob))
genotype = {'GT': '1/1', 'GQ': qual}
qual = self._phred(1 - primary_prob)
genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
info=info, qual=qual, genotype_data=genotype))
info=info, qual=self._pfmt(qual), genotype_data=genotype))

# no snp at this location
else:
if return_all:
# return variant even though it is not a snp
qual = self._pfmt(self._phred(1 - primary_prob))
genotype = {'GT': '0/0', 'GQ': qual}
qual = self._phred(1 - primary_prob)
genotype = {'GT': '0/0', 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt='.', filt='PASS',
info=info, qual=qual, genotype_data=genotype))
info=info, qual=self._pfmt(qual),
genotype_data=genotype))
return results

def decode_variants(self, sample, ref_seq):
Expand Down Expand Up @@ -948,8 +949,8 @@ def get_symbol(p):
info = {}

# log likelihood ratio
qual = self._pfmt(sum(pred_quals) - sum(ref_quals))
genotype = {'GT': '1', 'GQ': qual}
qual = sum(pred_quals) - sum(ref_quals)
genotype = {'GT': '1', 'GQ': self._pfmt(qual, 0)}

var_pos = pos['major'][start]

Expand All @@ -960,7 +961,8 @@ def get_symbol(p):

variant = medaka.vcf.Variant(sample.ref_name, var_pos, var_ref,
alt=var_pred, filt='PASS', info=info,
qual=qual, genotype_data=genotype)
qual=self._pfmt(qual),
genotype_data=genotype)
variant = variant.trim()
variants.append(variant)

Expand All @@ -973,7 +975,7 @@ def variant_metainfo(self):

m = [MI('FORMAT', 'GT', 1, 'String',
'Medaka genotype.'),
MI('FORMAT', 'GQ', 1, 'Float',
MI('FORMAT', 'GQ', 1, 'Integer',
'Medaka genotype quality score'), ]
if self.verbose:
m.extend([MI('INFO', 'ref_seq', 1, 'String',
Expand Down Expand Up @@ -1110,7 +1112,7 @@ def _make_info(rs, p, c):
if return_all:
# return variant even though it is not a snp
q = self._pfmt(qual)
genotype = {'GT': '0/0', 'GQ': q}
genotype = {'GT': '0/0', 'GQ': self._pfmt(qual, 0)}
info = _make_info(ref_symbol, prob, call)
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt='.', filt='PASS',
Expand All @@ -1122,7 +1124,7 @@ def _make_info(rs, p, c):
alt = [s for s in call if s != ref_symbol]
gt = '0/1' if len(alt) == 1 else '1/2'
q = self._pfmt(qual)
genotype = {'GT': gt, 'GQ': q}
genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)}
info = _make_info(ref_symbol, prob, call)
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
Expand All @@ -1136,15 +1138,15 @@ def _make_info(rs, p, c):
alt = [s for s in call if s != '*']
gt = '1/1'
q = self._pfmt(qual)
genotype = {'GT': gt, 'GQ': q}
genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)}
info = _make_info(ref_symbol, prob, call)
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
info=info, qual=q, genotype_data=genotype))
elif not contains_deletion: # homozygous (alt, alt)
alt = call[0]
q = self._pfmt(qual)
genotype = {'GT': '1/1', 'GQ': q}
genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)}
info = _make_info(ref_symbol, prob, call)
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
Expand Down Expand Up @@ -1231,8 +1233,8 @@ def padding_vector(self):
gap = [('*', '*')]
return self._labels_to_encoded_labels(gap)[0]

def _is_het(self, l):
return 1 if not self._singleton(l) else 0
def _is_het(self, x):
return 1 if not self._singleton(x) else 0

def encoded_labels_to_training_vectors(self, enc_labels):
"""Convert integer (tuple) encoded labels to truth vectors.
Expand Down Expand Up @@ -1313,50 +1315,51 @@ def _prob_to_snp(
not is_het,
not contains_deletion)):

qual = self._pfmt(self._phred(1 - primary_prob))
qual = self._phred(1 - primary_prob)
alt = call[0]
genotype = {'GT': '1/1', 'GQ': qual}
genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
info=info, qual=qual, genotype_data=genotype))
info=info, qual=self._pfmt(qual), genotype_data=genotype))

# heterozygous, no deletions
elif all((is_het,
not contains_deletion)):

err = 1 - 0.5 * (primary_prob + secondary_prob)
qual = self._pfmt(self._phred(err))
qual = self._phred(err)
alt = [s for s in call if s != ref_symbol]
gt = '0/1' if len(alt) == 1 else '1/2'
genotype = {'GT': gt, 'GQ': qual}
genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)}

results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
info=info, qual=qual, genotype_data=genotype))
info=info, qual=self._pfmt(qual), genotype_data=genotype))

# heterozygous, one deletion
elif all((is_het,
contains_nonref,
contains_deletion)):

qual = self._pfmt(self._phred(1 - primary_prob))
qual = self._phred(1 - primary_prob)
alt = [s for s in call if s != '*']
gt = '1/1'
genotype = {'GT': gt, 'GQ': qual}
genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)}

results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
info=info, qual=qual, genotype_data=genotype))
info=info, qual=self._pfmt(qual), genotype_data=genotype))

# no snp at this location
else:
if return_all:
qual = self._pfmt(ref_prob)
qual = self._phred(ref_prob)
# return variant even though it is not a snp
genotype = {'GT': '0/0', 'GQ': qual}
genotype = {'GT': '0/0', 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt='.', filt='PASS',
info=info, qual=qual, genotype_data=genotype))
info=info, qual=self._pfmt(qual),
genotype_data=genotype))
return results


Expand Down
2 changes: 1 addition & 1 deletion medaka/medaka.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,6 @@ def main():
msg = "Reads will be filtered to only those with RG tag: {}"
logger.info(msg.format(RG))
# if model is default, resolve to file, save mess in help text
if hasattr(args, 'model'):
if hasattr(args, 'model') and args.model is not None:
args.model = medaka.models.resolve_model(args.model)
args.func(args)
11 changes: 6 additions & 5 deletions medaka/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# r10 consensus
'r103_min_high_g345', 'r103_min_high_g360', 'r103_prom_high_g360',
# snp and variant
'r941_prom_snp_g322', 'r941_prom_variant_g322',
'r941_prom_snp_g360', 'r941_prom_variant_g360',
'r103_prom_snp_g3210', 'r103_prom_variant_g3210']
archived_models = [
# r9 consensus
Expand All @@ -35,12 +35,13 @@
# r10 consensus
'r10_min_high_g303', 'r10_min_high_g340',
# snp and variant
'r941_prom_snp_g303', 'r941_prom_variant_g303']
'r941_prom_snp_g303', 'r941_prom_variant_g303',
'r941_prom_snp_g322', 'r941_prom_variant_g322']
allowed_models = sorted(current_models + archived_models)
default_models = {
'consensus': 'r941_min_high_g351',
'snp': 'r941_prom_snp_g322',
'variant': 'r941_prom_variant_g322'}
'consensus': 'r941_min_high_g360',
'snp': 'r941_prom_snp_g360',
'variant': 'r941_prom_variant_g360'}

alignment_params = {
'rle': "-M 5 -S 4 -O 2 -E 3",
Expand Down
8 changes: 4 additions & 4 deletions medaka/test/test_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ def test_decode_variants(self):
self.assertEqual(v.alt, [alt], msg=msg)
self.assertEqual(v.genotype_data['GT'], '1', msg=msg)
self.assertAlmostEqual(float(v.qual), expected_qual, places=3, msg=msg)
self.assertAlmostEqual(float(v.genotype_data['GQ']), expected_qual, places=3, msg=msg)
self.assertEqual(int(v.genotype_data['GQ']), round(expected_qual), msg=msg)


def test_decode_snps(self):
Expand Down Expand Up @@ -411,7 +411,7 @@ def test_decode_snps(self):
self.assertEqual(v.alt, alt)
self.assertEqual(v.genotype_data['GT'], gt)
gq = qual_homo if len(set(v.gt)) == 1 else qual_hetero
self.assertAlmostEqual(float(v.genotype_data['GQ']), gq, places=3)
self.assertEqual(int(v.genotype_data['GQ']), round(gq))

def test_padding_vector(self):
self.assertEqual(self.ls.padding_vector, 0)
Expand Down Expand Up @@ -741,7 +741,7 @@ def test_decode_snps_is_het_always_true(self):
self.assertEqual(v.alt, alt)
self.assertEqual(v.genotype_data['GT'], gt)
gq = qual_hom if len(set(v.gt)) == 1 else qual_het
self.assertAlmostEqual(float(v.genotype_data['GQ']), gq, places=3)
self.assertEqual(int(v.genotype_data['GQ']), round(gq))

def test_decode_snps_is_het_always_off(self):

Expand Down Expand Up @@ -789,7 +789,7 @@ def test_decode_snps_is_het_always_off(self):
self.assertEqual(v.alt, alt)
self.assertEqual(v.genotype_data['GT'], gt)
gq = qual_hom if len(set(v.gt)) == 1 else qual_het
self.assertAlmostEqual(float(v.genotype_data['GQ']), gq, places=3)
self.assertEqual(int(v.genotype_data['GQ']), round(gq))

def test_padding_vector(self):
self.assertEqual(self.ls.padding_vector, 0)
Expand Down
Loading

0 comments on commit b5fc63f

Please sign in to comment.