Skip to content

Commit

Permalink
VCFHeader and VCFHeaderLine refactoring to enable support for VCF4.3/…
Browse files Browse the repository at this point in the history
…BCF2.2 and bug fixes.
  • Loading branch information
cmnbroad committed Nov 9, 2021
1 parent bf8d8da commit 0c44ef6
Show file tree
Hide file tree
Showing 60 changed files with 5,905 additions and 2,107 deletions.
6 changes: 6 additions & 0 deletions src/main/java/htsjdk/samtools/Defaults.java
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ public class Defaults {
*/
public static final boolean DISABLE_SNAPPY_COMPRESSOR;

/**
* Strict VCF version validation. Default = true.
*/
public static final boolean STRICT_VCF_VERSION_VALIDATION;


public static final String SAMJDK_PREFIX = "samjdk.";
static {
Expand All @@ -134,6 +139,7 @@ public class Defaults {
SAM_FLAG_FIELD_FORMAT = SamFlagField.valueOf(getStringProperty("sam_flag_field_format", SamFlagField.DECIMAL.name()));
SRA_LIBRARIES_DOWNLOAD = getBooleanProperty("sra_libraries_download", false);
DISABLE_SNAPPY_COMPRESSOR = getBooleanProperty(DISABLE_SNAPPY_PROPERTY_NAME, false);
STRICT_VCF_VERSION_VALIDATION = getBooleanProperty("strict_version_validation", true);
}

/**
Expand Down
15 changes: 15 additions & 0 deletions src/main/java/htsjdk/samtools/SAMSequenceDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@ public SAMSequenceDictionary(final List<SAMSequenceRecord> list) {
setSequences(list);
}

//TODO: this returns sequences in the internal list order instead of
// honoring each sequence's contigIndex
/**
* Get a list of sequences for this dictionary.
* @return the list of sequences for this dictionary in internal order (the order in which the sequences
* were added to this dictionary)
*/
public List<SAMSequenceRecord> getSequences() {
return Collections.unmodifiableList(mSequences);
}
Expand All @@ -75,6 +82,14 @@ public void setSequences(final List<SAMSequenceRecord> list) {
list.forEach(this::addSequence);
}

/**
* Add a sequence to the dictionary.
* @param sequenceRecord the sequence record to add - note that this method mutates the contig
* index of the sequenceRecord to match the newly added record's relative
* order in the list
*/
//TODO: this method ignores (and actually mutates) the sequenceRecord's contig index to make it match
// the record's relative placement in the dictionary's internal list
public void addSequence(final SAMSequenceRecord sequenceRecord) {
if (mSequenceMap.containsKey(sequenceRecord.getSequenceName())) {
throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " +
Expand Down
181 changes: 11 additions & 170 deletions src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java
Original file line number Diff line number Diff line change
@@ -1,26 +1,23 @@
package org.broadinstitute.hellbender.utils;
package htsjdk.samtools;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import htsjdk.utils.ValidationUtils;

import java.util.*;
import java.util.stream.Collectors;

/**
*
* A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference,
* A series of utility functions that enable comparison of two sequence dictionaries -- from the reference,
* from BAMs, or from feature sources -- for consistency. The system supports two basic modes: get an enum state that
* describes at a high level the consistency between two dictionaries, or a validateDictionaries that will
* blow up with a UserException if the dicts are too incompatible.
*
* Dictionaries are tested for contig name overlaps, consistency in ordering in these overlap set, and length,
* if available.
*/
public final class SequenceDictionaryUtils {
public final class SAMSequenceDictionaryUtils {

private SequenceDictionaryUtils(){}
private SAMSequenceDictionaryUtils(){}

/**
* Compares sequence records by their order
Expand Down Expand Up @@ -59,166 +56,10 @@ public enum SequenceDictionaryCompatibility {
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths
NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for example)
OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different
// orders with respect to each other
// orders with respect to each other
DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same
// order with respect to each other, but one or more of them have different
// indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 }
}

/**
* Tests for compatibility between two sequence dictionaries, using standard validation settings appropriate
* for the GATK. If the dictionaries are incompatible, then UserExceptions are thrown with detailed error messages.
*
* The standard validation settings used by this method are:
*
* -Require the dictionaries to share a common subset of equivalent contigs
*
* -Do not require dict1 to be a superset of dict2.
*
* -Do not perform checks related to contig ordering: don't throw if the common contigs are in
* different orders with respect to each other, occur at different absolute indices, or are
* lexicographically sorted human dictionaries. GATK uses contig names rather than contig
* indices, and so should not be sensitive to contig ordering issues.
*
* For comparing a CRAM dictionary against a reference dictionary, call
* {@link #validateCRAMDictionaryAgainstReference(SAMSequenceDictionary, SAMSequenceDictionary)} instead.
*
* @param name1 name associated with dict1
* @param dict1 the sequence dictionary dict1
* @param name2 name associated with dict2
* @param dict2 the sequence dictionary dict2
*/
public static void validateDictionaries( final String name1,
final SAMSequenceDictionary dict1,
final String name2,
final SAMSequenceDictionary dict2) {
final boolean requireSuperset = false;
final boolean checkContigOrdering = false;

validateDictionaries(name1, dict1, name2, dict2, requireSuperset, checkContigOrdering);
}

/**
* Tests for compatibility between a reference dictionary and a CRAM dictionary, using appropriate
* validation settings. If the dictionaries are incompatible, then UserExceptions are thrown with
* detailed error messages.
*
* The standard validation settings used by this method are:
*
* -Require the reference dictionary to be a superset of the cram dictionary
*
* -Do not perform checks related to contig ordering: don't throw if the common contigs are in
* different orders with respect to each other, occur at different absolute indices, or are
* lexicographically sorted human dictionaries. GATK uses contig names rather than contig
* indices, and so should not be sensitive to contig ordering issues.
*
* @param referenceDictionary the sequence dictionary for the reference
* @param cramDictionary sequence dictionary from a CRAM file
*/
public static void validateCRAMDictionaryAgainstReference( final SAMSequenceDictionary referenceDictionary,
final SAMSequenceDictionary cramDictionary ) {
// For CRAM, we require the reference dictionary to be a superset of the reads dictionary
final boolean requireSuperset = true;
final boolean checkContigOrdering = false;

validateDictionaries("reference", referenceDictionary, "reads", cramDictionary, requireSuperset, checkContigOrdering);
}


/**
* Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then
* UserExceptions are thrown with detailed error messages.
*
* Two sequence dictionaries are compatible if they share a common subset of equivalent contigs,
* where equivalent contigs are defined as having the same name and length.
*
* @param name1 name associated with dict1
* @param dict1 the sequence dictionary dict1
* @param name2 name associated with dict2
* @param dict2 the sequence dictionary dict2
* @param requireSuperset if true, require that dict1 be a superset of dict2, rather than dict1 and dict2 sharing a common subset
* @param checkContigOrdering if true, require common contigs to be in the same relative order with respect to each other
* and occur at the same absolute indices, and forbid lexicographically-sorted human dictionaries
*/
public static void validateDictionaries( final String name1,
final SAMSequenceDictionary dict1,
final String name2,
final SAMSequenceDictionary dict2,
final boolean requireSuperset,
final boolean checkContigOrdering ) {
Utils.nonNull(dict1, "Something went wrong with sequence dictionary detection, check that "+name1+" has a valid sequence dictionary");
Utils.nonNull(dict2, "Something went wrong with sequence dictionary detection, check that "+name2+" has a valid sequence dictionary");

final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2, checkContigOrdering);

switch ( type ) {
case IDENTICAL:
return;
case SUPERSET:
return;
case COMMON_SUBSET:
if ( requireSuperset ) {
final Set<String> contigs1 = dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toSet());
final List<String> missingContigs = dict2.getSequences().stream()
.map(SAMSequenceRecord::getSequenceName)
.filter(contig -> !contigs1.contains(contig))
.collect(Collectors.toList());
throw new UserException.IncompatibleSequenceDictionaries(String.format("Dictionary %s is missing contigs found in dictionary %s. Missing contigs: \n %s \n", name1, name2, String.join(", ", missingContigs)), name1, dict1, name2, dict2);
}
return;
case NO_COMMON_CONTIGS:
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);

case UNEQUAL_COMMON_CONTIGS: {
final List<SAMSequenceRecord> x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
final SAMSequenceRecord elt1 = x.get(0);
final SAMSequenceRecord elt2 = x.get(1);
throw new UserException.IncompatibleSequenceDictionaries(
String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d",
name1, elt1.getSequenceName(), elt1.getSequenceLength(),
name2, elt2.getSequenceName(), elt2.getSequenceLength()),
name1, dict1, name2, dict2
);
}

case NON_CANONICAL_HUMAN_ORDER: {
// We only get NON_CANONICAL_HUMAN_ORDER if the caller explicitly requested that we check contig ordering,
// so we should always throw when we see it.
final UserException ex;
if ( nonCanonicalHumanContigOrder(dict1) ) {
ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1);
}
else {
ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2);
}

throw ex;
}

case OUT_OF_ORDER: {
// We only get OUT_OF_ORDER if the caller explicitly requested that we check contig ordering,
// so we should always throw when we see it.
throw new UserException.IncompatibleSequenceDictionaries(
"The relative ordering of the common contigs in " + name1 + " and " + name2 +
" is not the same; to fix this please see: "
+ "(https://www.broadinstitute.org/gatk/guide/article?id=1328), "
+ " which describes reordering contigs in BAM and VCF files.",
name1, dict1, name2, dict2);
}

case DIFFERENT_INDICES: {
// We only get DIFFERENT_INDICES if the caller explicitly requested that we check contig ordering,
// so we should always throw when we see it.
final String msg = "One or more contigs common to both dictionaries have " +
"different indices (ie., absolute positions) in each dictionary. Code " +
"that is sensitive to contig ordering can fail when this is the case. " +
"You should fix the sequence dictionaries so that all shared contigs " +
"occur at the same absolute positions in both dictionaries.";
throw new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2);
}
default:
throw new GATKException("Unexpected SequenceDictionaryComparison type: " + type);
}
// order with respect to each other, but one or more of them have different
// indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 }
}

/**
Expand Down Expand Up @@ -465,14 +306,14 @@ public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SA
}

public static Set<String> getContigNames(SAMSequenceDictionary dict) {
Set<String> contigNames = new LinkedHashSet<String>(Utils.optimumHashSize(dict.size()));
Set<String> contigNames = new LinkedHashSet<String>(dict.size());
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
contigNames.add(dictionaryEntry.getSequenceName());
return contigNames;
}

public static List<String> getContigNamesList(final SAMSequenceDictionary refSeqDict) {
Utils.nonNull(refSeqDict, "provided reference sequence ditionary is null");
ValidationUtils.nonNull(refSeqDict, "provided reference sequence ditionary is null");
return refSeqDict.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList());
}

Expand All @@ -486,7 +327,7 @@ public static List<String> getContigNamesList(final SAMSequenceDictionary refSeq
* @return A String containing all of the contig names and lengths from the sequence dictionary it's passed
*/
public static String getDictionaryAsString( final SAMSequenceDictionary dict ) {
Utils.nonNull(dict, "Sequence dictionary must be non-null");
ValidationUtils.nonNull(dict, "Sequence dictionary must be non-null");

StringBuilder s = new StringBuilder("[ ");

Expand Down
6 changes: 6 additions & 0 deletions src/main/java/htsjdk/tribble/TribbleException.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ public static class InternalCodecException extends TribbleException {
public InternalCodecException(String message) { super (message); }
}

public static class VersionValidationFailure extends TribbleException {
public VersionValidationFailure(final String message) {
super(String.format("Version validation failure: %s", message));
}
}

// //////////////////////////////////////////////////////////////////////
// Index exceptions
// //////////////////////////////////////////////////////////////////////
Expand Down
29 changes: 19 additions & 10 deletions src/main/java/htsjdk/variant/bcf2/BCF2Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@

import htsjdk.samtools.util.FileExtensions;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.vcf.*;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFIDHeaderLine;
import htsjdk.variant.vcf.VCFSimpleHeaderLine;

import java.io.File;
import java.io.FileNotFoundException;
Expand Down Expand Up @@ -93,10 +97,15 @@ public static ArrayList<String> makeDictionary(final VCFHeader header) {
// set up the strings dictionary
for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) {
if ( line.shouldBeAddedToDictionary() ) {
final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line;
if ( ! seen.contains(idLine.getID())) {
dict.add(idLine.getID());
seen.add(idLine.getID());
if (!line.isIDHeaderLine()) {
//is there a better way to ensure that shouldBeAddedToDictionary==true only when isIDHeaderLine==true
throw new TribbleException(String.format(
"The header line %s cannot be added to the BCF dictionary since its not an ID header line",
line));
}
if ( ! seen.contains(line.getID())) {
dict.add(line.getID());
seen.add(line.getID());
}
}
}
Expand Down Expand Up @@ -291,7 +300,7 @@ else if ( o.getClass().isArray() ) {
* Are the elements and their order in the output and input headers consistent so that
* we can write out the raw genotypes block without decoding and recoding it?
*
* If the order of INFO, FILTER, or contrig elements in the output header is different than
* If the order of INFO, FILTER, or contig elements in the output header is different than
* in the input header we must decode the blocks using the input header and then recode them
* based on the new output order.
*
Expand All @@ -308,15 +317,15 @@ public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHe
if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) )
return false;

final Iterator<? extends VCFIDHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator();
final Iterator<? extends VCFIDHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator();
final Iterator<VCFSimpleHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator();
final Iterator<VCFSimpleHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator();

while ( inputLinesIt.hasNext() ) {
if ( ! outputLinesIt.hasNext() ) // missing lines in output
return false;

final VCFIDHeaderLine outputLine = outputLinesIt.next();
final VCFIDHeaderLine inputLine = inputLinesIt.next();
final VCFSimpleHeaderLine outputLine = outputLinesIt.next();
final VCFSimpleHeaderLine inputLine = inputLinesIt.next();

if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) )
return false;
Expand Down
Loading

0 comments on commit 0c44ef6

Please sign in to comment.