diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9be3c0b7..2c998057 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,15 @@
+v1.0.2
+-------
+Minor fixes and models release.
+
+* R9.4.1 variant calling models for Guppy 3.6.0 and updated benchmarks.
+* Made r941_min_high_g360 the default consensus model.
+* VCF GQ is now an integer in line with VCF spec.
+* Fixed issue requiring a previous model for training.
+* Fixed issue causing -p option of medaka_variant to crash.
+* Fixed issue preventing installation in a virtualenv with python <3.6.
+
+
v1.0.1
-------
Minor fixes release, resolving issues introduced in v1.0.0.
diff --git a/MANIFEST.in b/MANIFEST.in
index 11c99aa6..9c16dd4f 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,7 +5,7 @@ include LICENSE.md
include requirements.txt
include build.py
include Makefile
-graft submodules/samtools-1.3.1/htslib-1.3.1
+graft submodules/samtools-1.9/htslib-1.9
graft images
prune build
prune docs
diff --git a/docs/snp.rst b/docs/snp.rst
index 556798c8..3195fee2 100644
--- a/docs/snp.rst
+++ b/docs/snp.rst
@@ -127,6 +127,31 @@ the HG002 (NA24385) sample at 60-fold coverage.
| | Indel | 0.9508 | 0.9385 | 0.945 |
+------------------+-------+-----------+---------+----------+
+
+Guppy 3.6 SNP and Indel calling
+-------------------------------
+
+*Last updated June 2020*
+
+The benchmarks were performed as described above, but on chromosome 20 of
+the HG002 (NA24385) sample at 60-fold coverage.
+
+.. table::
+ R9.4.1 small variant calling at 60-fold coverage using `medaka variant` pipeline.
+ Comparison is made with the `GIAB `_
+ high confidence callset using `hap.py `_.
+
+ +-------------------------+-------+-----------+---------+----------+
+ | medaka / variant model | Class | Precision | Recall | F1 score |
+ +-------------------------+-------+-----------+---------+----------+
+ | v1.0.1 / | SNP | 0.9898 | 0.9931 | 0.992 |
+ + r941_prom_variant_g322 +-------+-----------+---------+----------+
+ | | Indel | 0.9199 | 0.8634 | 0.891 |
+ +-------------------------+-------+-----------+---------+----------+
+ | v1.0.2 / | SNP | 0.9917 | 0.9965 | 0.994 |
+ + r941_prom_variant_g360 +-------+-----------+---------+----------+
+ | | Indel | 0.9379 | 0.8982 | 0.918 |
+ +-------------------------+-------+-----------+---------+----------+
Performing Variant Calling
diff --git a/medaka/__init__.py b/medaka/__init__.py
index 76dff04b..faa6ee66 100644
--- a/medaka/__init__.py
+++ b/medaka/__init__.py
@@ -4,7 +4,7 @@
import os
import subprocess
-__version__ = '1.0.1'
+__version__ = '1.0.2'
def check_minimap2_version():
diff --git a/medaka/common.py b/medaka/common.py
index 8093b600..92e719ab 100644
--- a/medaka/common.py
+++ b/medaka/common.py
@@ -539,7 +539,7 @@ def get_regions(bam, region_strs=None):
if region_strs is not None:
if os.path.isfile(region_strs[0]):
with open(region_strs[0]) as fh:
- region_strs = [l.strip() for l in fh.readlines()]
+ region_strs = [line.strip() for line in fh.readlines()]
regions = []
for r in (Region.from_string(x) for x in region_strs):
diff --git a/medaka/data/r941_prom_snp_g360_model.hdf5 b/medaka/data/r941_prom_snp_g360_model.hdf5
new file mode 100755
index 00000000..2779d2e9
--- /dev/null
+++ b/medaka/data/r941_prom_snp_g360_model.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40dec35aa25e70eef2066ccb5e938aa92008d32fc247fc03ef2f3c44fc11133e
+size 3320052
diff --git a/medaka/data/r941_prom_variant_g360_model.hdf5 b/medaka/data/r941_prom_variant_g360_model.hdf5
new file mode 100755
index 00000000..4e7c8ffb
--- /dev/null
+++ b/medaka/data/r941_prom_variant_g360_model.hdf5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e61e7c7223e87426df7ae1aad7ccc9cd10bc6cab3ae27f0892884d39fd7976c
+size 3299572
diff --git a/medaka/labels.py b/medaka/labels.py
index a1c8705c..3831971c 100644
--- a/medaka/labels.py
+++ b/medaka/labels.py
@@ -382,12 +382,12 @@ def _phred(err, cap=70.0):
return np.minimum(q, cap)
@staticmethod
- def _pfmt(p):
- """Cast float to string with 3 decimal places.
+ def _pfmt(p, dp=3):
+ """Cast float to string rounding to dp decimal places.
Used to format probabilities and quality scores for vcf output.
"""
- return '{:.3f}'.format(p)
+ return '{:.{dp}f}'.format(round(p, dp), dp=dp)
@abc.abstractmethod
def _alignment_to_pairs(self, aln):
@@ -652,7 +652,7 @@ def snp_metainfo(self):
"""Return meta data for use in `.vcf` header."""
MI = medaka.vcf.MetaInfo
m = [MI('FORMAT', 'GT', 1, 'String', 'Medaka genotype'),
- MI('FORMAT', 'GQ', 1, 'Float', 'Medaka genotype quality score')]
+ MI('FORMAT', 'GQ', 1, 'Integer', 'Medaka genotype quality score')]
if self.verbose:
m.extend([MI('INFO', 'ref_prob', 1, 'Float',
'Medaka probability for reference allele'),
@@ -783,11 +783,11 @@ def _prob_to_snp(
not secondary_exceeds_threshold)):
alt = primary_call
- qual = self._pfmt(self._phred(1 - primary_prob))
- genotype = {'GT': '1/1', 'GQ': qual}
+ qual = self._phred(1 - primary_prob)
+ genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS', info=info,
- qual=qual, genotype_data=genotype))
+ qual=self._pfmt(qual), genotype_data=genotype))
# heterozygous, no deletions
elif all((not primary_is_deletion,
@@ -795,16 +795,16 @@ def _prob_to_snp(
secondary_exceeds_threshold)):
err = 1 - (primary_prob + secondary_prob)
- qual = self._pfmt(self._phred(err))
+ qual = self._phred(err)
# filtering by list comp maintains order
alt = [c for c in [primary_call, secondary_call]
if not c == ref_symbol]
gt = '0/1' if len(alt) == 1 else '1/2'
- genotype = {'GT': gt, 'GQ': qual}
+ genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
- info=info, qual=qual, genotype_data=genotype))
+ info=info, qual=self._pfmt(qual), genotype_data=genotype))
# heterozygous, secondary deletion
elif all((not primary_is_reference,
@@ -813,21 +813,22 @@ def _prob_to_snp(
secondary_exceeds_threshold)):
alt = primary_call
- qual = self._pfmt(self._phred(1 - primary_prob))
- genotype = {'GT': '1/1', 'GQ': qual}
+ qual = self._phred(1 - primary_prob)
+ genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
- info=info, qual=qual, genotype_data=genotype))
+ info=info, qual=self._pfmt(qual), genotype_data=genotype))
# no snp at this location
else:
if return_all:
# return variant even though it is not a snp
- qual = self._pfmt(self._phred(1 - primary_prob))
- genotype = {'GT': '0/0', 'GQ': qual}
+ qual = self._phred(1 - primary_prob)
+ genotype = {'GT': '0/0', 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt='.', filt='PASS',
- info=info, qual=qual, genotype_data=genotype))
+ info=info, qual=self._pfmt(qual),
+ genotype_data=genotype))
return results
def decode_variants(self, sample, ref_seq):
@@ -948,8 +949,8 @@ def get_symbol(p):
info = {}
# log likelihood ratio
- qual = self._pfmt(sum(pred_quals) - sum(ref_quals))
- genotype = {'GT': '1', 'GQ': qual}
+ qual = sum(pred_quals) - sum(ref_quals)
+ genotype = {'GT': '1', 'GQ': self._pfmt(qual, 0)}
var_pos = pos['major'][start]
@@ -960,7 +961,8 @@ def get_symbol(p):
variant = medaka.vcf.Variant(sample.ref_name, var_pos, var_ref,
alt=var_pred, filt='PASS', info=info,
- qual=qual, genotype_data=genotype)
+ qual=self._pfmt(qual),
+ genotype_data=genotype)
variant = variant.trim()
variants.append(variant)
@@ -973,7 +975,7 @@ def variant_metainfo(self):
m = [MI('FORMAT', 'GT', 1, 'String',
'Medaka genotype.'),
- MI('FORMAT', 'GQ', 1, 'Float',
+ MI('FORMAT', 'GQ', 1, 'Integer',
'Medaka genotype quality score'), ]
if self.verbose:
m.extend([MI('INFO', 'ref_seq', 1, 'String',
@@ -1110,7 +1112,7 @@ def _make_info(rs, p, c):
if return_all:
# return variant even though it is not a snp
q = self._pfmt(qual)
- genotype = {'GT': '0/0', 'GQ': q}
+ genotype = {'GT': '0/0', 'GQ': self._pfmt(qual, 0)}
info = _make_info(ref_symbol, prob, call)
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt='.', filt='PASS',
@@ -1122,7 +1124,7 @@ def _make_info(rs, p, c):
alt = [s for s in call if s != ref_symbol]
gt = '0/1' if len(alt) == 1 else '1/2'
q = self._pfmt(qual)
- genotype = {'GT': gt, 'GQ': q}
+ genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)}
info = _make_info(ref_symbol, prob, call)
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
@@ -1136,7 +1138,7 @@ def _make_info(rs, p, c):
alt = [s for s in call if s != '*']
gt = '1/1'
q = self._pfmt(qual)
- genotype = {'GT': gt, 'GQ': q}
+ genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)}
info = _make_info(ref_symbol, prob, call)
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
@@ -1144,7 +1146,7 @@ def _make_info(rs, p, c):
elif not contains_deletion: # homozygous (alt, alt)
alt = call[0]
q = self._pfmt(qual)
- genotype = {'GT': '1/1', 'GQ': q}
+ genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)}
info = _make_info(ref_symbol, prob, call)
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
@@ -1231,8 +1233,8 @@ def padding_vector(self):
gap = [('*', '*')]
return self._labels_to_encoded_labels(gap)[0]
- def _is_het(self, l):
- return 1 if not self._singleton(l) else 0
+ def _is_het(self, x):
+ return 1 if not self._singleton(x) else 0
def encoded_labels_to_training_vectors(self, enc_labels):
"""Convert integer (tuple) encoded labels to truth vectors.
@@ -1313,50 +1315,51 @@ def _prob_to_snp(
not is_het,
not contains_deletion)):
- qual = self._pfmt(self._phred(1 - primary_prob))
+ qual = self._phred(1 - primary_prob)
alt = call[0]
- genotype = {'GT': '1/1', 'GQ': qual}
+ genotype = {'GT': '1/1', 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
- info=info, qual=qual, genotype_data=genotype))
+ info=info, qual=self._pfmt(qual), genotype_data=genotype))
# heterozygous, no deletions
elif all((is_het,
not contains_deletion)):
err = 1 - 0.5 * (primary_prob + secondary_prob)
- qual = self._pfmt(self._phred(err))
+ qual = self._phred(err)
alt = [s for s in call if s != ref_symbol]
gt = '0/1' if len(alt) == 1 else '1/2'
- genotype = {'GT': gt, 'GQ': qual}
+ genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
- info=info, qual=qual, genotype_data=genotype))
+ info=info, qual=self._pfmt(qual), genotype_data=genotype))
# heterozygous, one deletion
elif all((is_het,
contains_nonref,
contains_deletion)):
- qual = self._pfmt(self._phred(1 - primary_prob))
+ qual = self._phred(1 - primary_prob)
alt = [s for s in call if s != '*']
gt = '1/1'
- genotype = {'GT': gt, 'GQ': qual}
+ genotype = {'GT': gt, 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt, filt='PASS',
- info=info, qual=qual, genotype_data=genotype))
+ info=info, qual=self._pfmt(qual), genotype_data=genotype))
# no snp at this location
else:
if return_all:
- qual = self._pfmt(ref_prob)
+ qual = self._phred(ref_prob)
# return variant even though it is not a snp
- genotype = {'GT': '0/0', 'GQ': qual}
+ genotype = {'GT': '0/0', 'GQ': self._pfmt(qual, 0)}
results.append(medaka.vcf.Variant(
ref_name, pos, ref_symbol, alt='.', filt='PASS',
- info=info, qual=qual, genotype_data=genotype))
+ info=info, qual=self._pfmt(qual),
+ genotype_data=genotype))
return results
diff --git a/medaka/medaka.py b/medaka/medaka.py
index 1c92f15b..19798d5d 100644
--- a/medaka/medaka.py
+++ b/medaka/medaka.py
@@ -638,6 +638,6 @@ def main():
msg = "Reads will be filtered to only those with RG tag: {}"
logger.info(msg.format(RG))
# if model is default, resolve to file, save mess in help text
- if hasattr(args, 'model'):
+ if hasattr(args, 'model') and args.model is not None:
args.model = medaka.models.resolve_model(args.model)
args.func(args)
diff --git a/medaka/options.py b/medaka/options.py
index 7b27effc..785487cb 100644
--- a/medaka/options.py
+++ b/medaka/options.py
@@ -26,7 +26,7 @@
# r10 consensus
'r103_min_high_g345', 'r103_min_high_g360', 'r103_prom_high_g360',
# snp and variant
- 'r941_prom_snp_g322', 'r941_prom_variant_g322',
+ 'r941_prom_snp_g360', 'r941_prom_variant_g360',
'r103_prom_snp_g3210', 'r103_prom_variant_g3210']
archived_models = [
# r9 consensus
@@ -35,12 +35,13 @@
# r10 consensus
'r10_min_high_g303', 'r10_min_high_g340',
# snp and variant
- 'r941_prom_snp_g303', 'r941_prom_variant_g303']
+ 'r941_prom_snp_g303', 'r941_prom_variant_g303',
+ 'r941_prom_snp_g322', 'r941_prom_variant_g322']
allowed_models = sorted(current_models + archived_models)
default_models = {
- 'consensus': 'r941_min_high_g351',
- 'snp': 'r941_prom_snp_g322',
- 'variant': 'r941_prom_variant_g322'}
+ 'consensus': 'r941_min_high_g360',
+ 'snp': 'r941_prom_snp_g360',
+ 'variant': 'r941_prom_variant_g360'}
alignment_params = {
'rle': "-M 5 -S 4 -O 2 -E 3",
diff --git a/medaka/test/test_labels.py b/medaka/test/test_labels.py
index 1744b3b2..7dfd8d10 100644
--- a/medaka/test/test_labels.py
+++ b/medaka/test/test_labels.py
@@ -337,7 +337,7 @@ def test_decode_variants(self):
self.assertEqual(v.alt, [alt], msg=msg)
self.assertEqual(v.genotype_data['GT'], '1', msg=msg)
self.assertAlmostEqual(float(v.qual), expected_qual, places=3, msg=msg)
- self.assertAlmostEqual(float(v.genotype_data['GQ']), expected_qual, places=3, msg=msg)
+ self.assertEqual(int(v.genotype_data['GQ']), round(expected_qual), msg=msg)
def test_decode_snps(self):
@@ -411,7 +411,7 @@ def test_decode_snps(self):
self.assertEqual(v.alt, alt)
self.assertEqual(v.genotype_data['GT'], gt)
gq = qual_homo if len(set(v.gt)) == 1 else qual_hetero
- self.assertAlmostEqual(float(v.genotype_data['GQ']), gq, places=3)
+ self.assertEqual(int(v.genotype_data['GQ']), round(gq))
def test_padding_vector(self):
self.assertEqual(self.ls.padding_vector, 0)
@@ -741,7 +741,7 @@ def test_decode_snps_is_het_always_true(self):
self.assertEqual(v.alt, alt)
self.assertEqual(v.genotype_data['GT'], gt)
gq = qual_hom if len(set(v.gt)) == 1 else qual_het
- self.assertAlmostEqual(float(v.genotype_data['GQ']), gq, places=3)
+ self.assertEqual(int(v.genotype_data['GQ']), round(gq))
def test_decode_snps_is_het_always_off(self):
@@ -789,7 +789,7 @@ def test_decode_snps_is_het_always_off(self):
self.assertEqual(v.alt, alt)
self.assertEqual(v.genotype_data['GT'], gt)
gq = qual_hom if len(set(v.gt)) == 1 else qual_het
- self.assertAlmostEqual(float(v.genotype_data['GQ']), gq, places=3)
+ self.assertEqual(int(v.genotype_data['GQ']), round(gq))
def test_padding_vector(self):
self.assertEqual(self.ls.padding_vector, 0)
diff --git a/medaka/vcf.py b/medaka/vcf.py
index 710f9722..1ffe0e60 100644
--- a/medaka/vcf.py
+++ b/medaka/vcf.py
@@ -699,7 +699,7 @@ def _merge_variants(
gts = [1, 0]
gt = gt_sep.join(map(str, gts))
- genotype_data = {'GT': gt, 'GQ': qual}
+ genotype_data = {'GT': gt, 'GQ': round(qual)}
return Variant(
v.chrom, interval.begin, ref, alt=alts,
@@ -799,9 +799,13 @@ def meta_info(self):
# 'Number' value should be āGā.
m.append(MetaInfo(
'FORMAT', 'GT', 'G', 'String', 'Genotype'))
- # if this is not a float, vcf benchmarking tools may fail
+ # VCF spec states that GQ should be an int, and whatshap >0.18 will
+ # fail to parse the vcf if it's a float.
+ # vcf benchmarking tools which require a float could use the QUAL
+ # as we write out the same number for QUAL and GQ, the QG is just
+ # rounded.
m.append(MetaInfo(
- 'FORMAT', 'GQ', 'G', 'Float', 'Genotype quality score'))
+ 'FORMAT', 'GQ', 'G', 'Integer', 'Genotype quality score'))
return m
diff --git a/requirements.txt b/requirements.txt
index 9b877827..f0cfea63 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
# medaka requirements.
# Add comments to keep track of why we are using particular versions
-biopython
+biopython>=1.73,<1.77 # 1.76 is last version of this whatshap dependency to support python < 3.6
cffi
grpcio==1.27.2 # last version of this tensorflow dependency to have a manylinux1 wheel.
h5py==2.7.1 # 2.8.0 and 2.9.0 seem to not like keras model writing
diff --git a/scripts/medaka_variant b/scripts/medaka_variant
index 57954b86..b8300312 100755
--- a/scripts/medaka_variant
+++ b/scripts/medaka_variant
@@ -32,8 +32,8 @@ PHASEOUT=false
DELETE=false
SNPMODEL=${modeldata[2]##* }
MODEL=${modeldata[3]##* }
-INDEL_THRESHOLD=14
-SNP_THRESHOLD=12
+INDEL_THRESHOLD=9
+SNP_THRESHOLD=8
FILTER_FLAG=true
STOPAFTERROUND0=false
@@ -45,7 +45,7 @@ Variant calling via neural networks. The input bam should be aligned to the
reference against which to call variants. Note: that although configurable
it is unlikely that the model arguments should be changed from their defaults.
-$(basename "$0") [-h] -i
+$(basename "$0") [-h] -i
-h show this help text.
-i input bam of reads aligned to ref. Read groups are currently ignored,
@@ -397,6 +397,8 @@ else
# Filter resulting vcf
FILTER="(((TYPE=\"snp\" | TYPE=\"mnp\") & (QUAL > ${SNP_THRESHOLD})) | ((TYPE!=\"snp\" & TYPE!=\"mnp\") & (QUAL > ${INDEL_THRESHOLD})))"
bcftools filter --threads $THREADS -s lowqual -i "${FILTER}" ${UNFILTERED} > ${FINAL}
+ # pyvcf used in whatshap<=0.18 can't parse the ##FILTER=