-
Notifications
You must be signed in to change notification settings - Fork 244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Require sequence dictionary MD5s when writing CRAM. #1612
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,9 +23,10 @@ | |
*/ | ||
package htsjdk.samtools; | ||
|
||
import htsjdk.io.HtsPath; | ||
import htsjdk.io.IOPath; | ||
import htsjdk.samtools.cram.ref.CRAMReferenceSource; | ||
import htsjdk.samtools.cram.ref.ReferenceSource; | ||
import htsjdk.samtools.cram.structure.CRAMEncodingStrategy; | ||
import htsjdk.samtools.util.BlockCompressedOutputStream; | ||
import htsjdk.samtools.util.FileExtensions; | ||
import htsjdk.samtools.util.IOUtil; | ||
|
@@ -36,9 +37,11 @@ | |
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.OutputStream; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.List; | ||
import java.util.zip.Deflater; | ||
import static htsjdk.samtools.SamReader.Type.*; | ||
|
||
|
@@ -634,6 +637,7 @@ private CRAMFileWriter createCRAMWriterWithSettings( | |
final Path referenceFasta) { | ||
|
||
final CRAMReferenceSource referenceSource; | ||
SAMFileHeader repairedHeader = header; | ||
if (referenceFasta == null) { | ||
log.info("Reference fasta is not provided when writing CRAM file " + outputFile.toUri().toString()); | ||
log.info("Will attempt to use a default reference or download as set by defaults:"); | ||
|
@@ -642,6 +646,10 @@ private CRAMFileWriter createCRAMWriterWithSettings( | |
|
||
referenceSource = ReferenceSource.getDefaultCRAMReferenceSource(); | ||
} else { | ||
// CRAMs are required to have a SAMHeader with a sequence dictionary that contains MD5 | ||
// checksums for each sequence. Check the proposed dictionary to see if it has MD5s, and | ||
// if not, attempt to update it with MD5s from the reference dictionary, if one exists. | ||
repairedHeader = repairSequenceMD5s(header, new HtsPath(referenceFasta.toUri().toString())); | ||
referenceSource = new ReferenceSource(referenceFasta); | ||
} | ||
OutputStream cramOS = null; | ||
|
@@ -674,7 +682,7 @@ private CRAMFileWriter createCRAMWriterWithSettings( | |
indexOS, | ||
presorted, | ||
referenceSource, | ||
header, | ||
repairedHeader, | ||
outputFile.toUri().toString()); | ||
setCRAMWriterDefaults(writer); | ||
|
||
|
@@ -687,6 +695,35 @@ private void setCRAMWriterDefaults(final CRAMFileWriter writer) { | |
//writer.setEncodingParams(new CRAMEncodingStrategy()); | ||
} | ||
|
||
// If any sequence records don't contain MD5s, attempt to repair them by consulting the reference's | ||
// dictionary file, if one exists. | ||
private SAMFileHeader repairSequenceMD5s(final SAMFileHeader header, final IOPath referenceFasta) { | ||
final List<SAMSequenceRecord> missingMD5s = header.getSequenceDictionary().getSequencesWithMissingMD5s(); | ||
if (!missingMD5s.isEmpty()) { | ||
final String missingMD5Message = SAMSequenceDictionary.createFormattedMD5Message("SAM header", missingMD5s); | ||
log.warn(String.format( | ||
"%s Attempting to use the reference dictionary to repair missing MD5s required for CRAM output.", | ||
missingMD5Message)); | ||
final IOPath referenceDictionary = SAMSequenceDictionary.getFastaDictionaryFileName(referenceFasta); | ||
try (final InputStream fastaDictionaryStream = referenceDictionary.getInputStream()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems really awkward:
I think we should eliminate these lines and the new methods on SAMSequenceDictionary and instead add an accessor for the sequence dictionary to ReferenceSource. Then do the repair call after opening the reference source and get the sequence dictionary directly from the ReferenceSource. |
||
final SAMSequenceDictionary newDictionary = | ||
SAMSequenceDictionary.loadSAMSequenceDictionary(fastaDictionaryStream); | ||
newDictionary.requireMD5sOrThrow(referenceFasta.toString()); | ||
final SAMFileHeader repairedHeader = header.clone(); | ||
repairedHeader.setSequenceDictionary(newDictionary); | ||
return repairedHeader; | ||
} catch (final IOException | RuntimeException e) { | ||
throw new RuntimeException( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be a CRAM or SAMException instead? |
||
String.format( | ||
"The attempt to repair the missing sequence MD5s (%s) using the reference dictionary %s failed.", | ||
missingMD5Message, | ||
referenceDictionary), | ||
e); | ||
} | ||
} | ||
return header; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "SAMFileWriterFactory [createIndex=" + createIndex + ", createMd5File=" + createMd5File + ", useAsyncIo=" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,8 +24,15 @@ | |
package htsjdk.samtools; | ||
|
||
import htsjdk.beta.plugin.HtsHeader; | ||
import htsjdk.io.HtsPath; | ||
import htsjdk.io.IOPath; | ||
import htsjdk.samtools.util.BufferedLineReader; | ||
import htsjdk.samtools.util.FileExtensions; | ||
import htsjdk.samtools.util.Log; | ||
import htsjdk.samtools.util.RuntimeIOException; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.Serializable; | ||
import java.math.BigInteger; | ||
import java.security.MessageDigest; | ||
|
@@ -294,6 +301,99 @@ public String md5() { | |
} | ||
} | ||
|
||
|
||
/** | ||
* Given a fasta filename, return the name of the corresponding dictionary file. | ||
*/ | ||
public static IOPath getFastaDictionaryFileName(IOPath fastaFile) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we should be adding more filename guessing and munging here. I think we can rely on ReferenceSequenceFile to find and provide the SequenceDictionary. |
||
final String fastaName = fastaFile.getURIString(); | ||
int lastDot = fastaName.lastIndexOf('.'); | ||
return new HtsPath(fastaName.substring(0, lastDot) + FileExtensions.DICT); | ||
} | ||
|
||
/** | ||
* Given a fasta dictionary file, returns its sequence dictionary | ||
* | ||
* @param fastaDictionary fasta dictionary file | ||
* @return the SAMSequenceDictionary from fastaDictionaryFile | ||
*/ | ||
public static SAMSequenceDictionary loadSAMSequenceDictionary( final IOPath fastaDictionary ) { | ||
try ( final InputStream fastaDictionaryStream = fastaDictionary.getInputStream() ) { | ||
return loadSAMSequenceDictionary(fastaDictionaryStream); | ||
} | ||
catch ( IOException e ) { | ||
throw new RuntimeIOException("Error loading fasta dictionary file " + fastaDictionary, e); | ||
} | ||
} | ||
|
||
/** | ||
* Given an InputStream connected to a fasta dictionary, returns its sequence dictionary | ||
* | ||
* Note: does not close the InputStream it's passed | ||
* | ||
* @param fastaDictionaryStream InputStream connected to a fasta dictionary | ||
* @return the SAMSequenceDictionary from the fastaDictionaryStream | ||
*/ | ||
public static SAMSequenceDictionary loadSAMSequenceDictionary( final InputStream fastaDictionaryStream ) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is probably a good thing to have here. Can you update SAMSequenceDictionaryExtractor to call this method? |
||
// Don't close the reader when we're done, since we don't want to close the client's InputStream for them | ||
final BufferedLineReader reader = new BufferedLineReader(fastaDictionaryStream); | ||
|
||
final SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); | ||
final SAMFileHeader header = codec.decode(reader, fastaDictionaryStream.toString()); | ||
|
||
// Make sure we have a valid sequence dictionary before continuing: | ||
if (header.getSequenceDictionary() == null || header.getSequenceDictionary().isEmpty()) { | ||
throw new RuntimeException ( | ||
"Could not read sequence dictionary from given fasta stream " + | ||
fastaDictionaryStream | ||
); | ||
} | ||
|
||
return header.getSequenceDictionary(); | ||
} | ||
|
||
final void requireMD5sOrThrow(final String contextMessage) { | ||
final List<SAMSequenceRecord> missingMD5s = getSequencesWithMissingMD5s(); | ||
if (!missingMD5s.isEmpty()) { | ||
throw new RuntimeException(SAMSequenceDictionary.createFormattedMD5Message(contextMessage, missingMD5s)); | ||
} | ||
} | ||
|
||
/** | ||
* @return all sequences in this dictionary that do not have an MD5 attribute | ||
*/ | ||
List<SAMSequenceRecord> getSequencesWithMissingMD5s() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These seem like very specific methods. It's a bit funny to have them here as package private. I wonder if it makes more sense to move them to the callsite. |
||
return getSequences().stream().filter( | ||
seqRec -> { | ||
final String MD5 = seqRec.getMd5(); | ||
return MD5 == null || MD5.length() == 0; | ||
}).collect(Collectors.toList()); | ||
} | ||
|
||
/** | ||
* Create a formatted error string message describing which MD5s are missing from the sequence dictionary. | ||
* | ||
* @param contextID a string recognizable to the user describing the input that is the source of the failure | ||
* @param badSequenceRecords the sequence with missing MD5s | ||
* @return a message string suitable for presentation to the user | ||
*/ | ||
static String createFormattedMD5Message(final String contextID, List<SAMSequenceRecord> badSequenceRecords) { | ||
final int MAX_ERRORS_REPORTED = 10; | ||
if (badSequenceRecords.size() != 0) { | ||
return String.format( | ||
"The sequence dictionary for %s is missing the required MD5 checksum for some contigs: %s%s.", | ||
contextID, | ||
badSequenceRecords.stream() | ||
.limit(Integer.min(badSequenceRecords.size(), MAX_ERRORS_REPORTED)) | ||
.map(SAMSequenceRecord::getSequenceName) | ||
.collect(Collectors.joining(",")), | ||
badSequenceRecords.size() > MAX_ERRORS_REPORTED ? | ||
" and others": | ||
""); | ||
} | ||
return null; | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return mSequences.hashCode(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,16 +24,16 @@ | |
package htsjdk.samtools; | ||
|
||
import htsjdk.samtools.util.Locatable; | ||
import htsjdk.samtools.util.StringUtil; | ||
import htsjdk.samtools.util.SequenceUtil; | ||
|
||
import java.math.BigInteger; | ||
import java.util.ArrayList; | ||
import java.security.MessageDigest; | ||
import java.security.NoSuchAlgorithmException; | ||
import java.util.Arrays; | ||
import java.util.Collection; | ||
import java.util.Collections; | ||
import java.util.HashSet; | ||
import java.util.LinkedHashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
import java.util.regex.Pattern; | ||
|
@@ -148,6 +148,23 @@ public SAMSequenceRecord setMd5(final String value) { | |
return this; | ||
} | ||
|
||
/** | ||
* Set the M5 attribute for this sequence using the provided sequenceBases. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo: M5 -> MD5 |
||
* @param sequenceBases sequence bases to use to compute the MD5 for this sequence | ||
* @return the update sequence record | ||
*/ | ||
public SAMSequenceRecord setComputedMd5(final byte[] sequenceBases) { | ||
try { | ||
final MessageDigest md5Digester = MessageDigest.getInstance("MD5"); | ||
if (md5Digester != null) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This method seems broken. It doesn't actually digest the bases. Also, the bases should be normalized. Why not use SequenceUtil.calculateMD5String()? |
||
setMd5(SequenceUtil.md5DigestToString(md5Digester.digest())); | ||
} | ||
} catch (final NoSuchAlgorithmException e) { | ||
throw new RuntimeException("Error getting MD5 algorithm for sequence record MD5 computation", e); | ||
} | ||
return this; | ||
} | ||
|
||
public String getDescription() { | ||
return getAttribute(DESCRIPTION_TAG); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
An aside, should we add direct HtsPath(Path) HtsPath(File) constructors?