diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java index 4b566ed763..89c4ef2d3a 100644 --- a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java +++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationDecode.java @@ -7,6 +7,9 @@ import java.util.List; import java.util.StringJoiner; +//for performance reasons, it would probably be wise to separate the string tokens from the int tokens +// so we don't have to repeatedly interconvert them when fetching from this list + public class NameTokenisationDecode { // TODO: lift these values to a common location since they're used by the encoder, the decoder, and the tests // for now, since we're returning a String of all the names (instead of a list, which is more efficient) because, @@ -15,65 +18,68 @@ public class NameTokenisationDecode { public final static byte NAME_SEPARATOR = 0; public final static CharSequence LOCAL_NAME_SEPARATOR_CHARSEQUENCE = new String(new byte[] {NAME_SEPARATOR}); + //TODO: once we get clarity on the spec (https://github.com/samtools/hts-specs/issues/802) on how to + // calculate this, we should use it to verify the result of decompressing + public static final int UNCOMPRESSED_LENGTH_ADJUSTMENT = 1; + // the input must be a ByteBuffer containing the read names, separated by the NAME_SEPARATOR byte, WITHOUT // a terminating separator public String uncompress(final ByteBuffer inBuffer) { inBuffer.order(ByteOrder.LITTLE_ENDIAN); - final int uncompressedLength = inBuffer.getInt() & 0xFFFFFFFF; //unused but we have to consume it - final int numNames = inBuffer.getInt() & 0xFFFFFFFF; - final int useArith = inBuffer.get() & 0xFF; + //unused but we have to consume it + final int uncompressedLength = inBuffer.getInt() & 0xFFFFFFFF -1 - UNCOMPRESSED_LENGTH_ADJUSTMENT; + final int numNames = 
inBuffer.getInt() & 0xFFFFFFFF; + final int useArith = inBuffer.get() & 0xFF; final TokenStreams tokenStreams = new TokenStreams(inBuffer, useArith, numNames); - // keep track of token lists for names we've already seen for subsequent lookup/reference (indexed as (nameIndex, tokenPosition)) - - //TODO: for performance reasons, it would probably be wise to separate the string tokens from the int tokens - // so we don't have to repeatedly interconvert them when fetching from this list - // two dimensional array of previously encoded tokens, indexed as [nameIndex, position] - final List> previousEncodedTokens = new ArrayList<>(numNames); - for (int i = 0; i < numNames; i++) { - //TODO: preallocate this list to the expected number of tokens per name - previousEncodedTokens.add(new ArrayList<>()); - } - - //TODO: if we stored the names in an index-addressible array, or list, as we find them, instead of a StringJoiner, - // subsequent calls to decodeSingleName could look them up by index and return them directly when it sees a dup, - // rather than re-joining the tokens each time; maybe not worth it ? 
+ // two-dimensional array of previously decoded tokens, indexed as (nameIndex, tokenPosition - 1); note + // that unlike the TYPE stream in TokenStreams, where token position 1 is located at index 1 because of + // the use of position 0 for metadata, since there is no meta data stored at position 0 in this list, + // position n in the original token stream is located at index n-1 in this list + // + // we don't preallocate these lists because we don't know how many tokens there will be in each name, + // so we'll create each one just-in-time as we need it + final List> decodedNameTokens = new ArrayList<>(numNames); final StringJoiner decodedNamesJoiner = new StringJoiner(LOCAL_NAME_SEPARATOR_CHARSEQUENCE); for (int i = 0; i < numNames; i++) { - decodedNamesJoiner.add(decodeSingleName(tokenStreams, previousEncodedTokens, i)); + decodedNamesJoiner.add(decodeSingleName(tokenStreams, decodedNameTokens, i)); } return decodedNamesJoiner.toString(); } private String decodeSingleName( final TokenStreams tokenStreams, - final List> previousEncodedTokens, + final List> decodedNameTokens, final int currentNameIndex) { + // consult tokenStreams[0, TokenStreams.TOKEN_TYPE] to determine if this name uses dup or diff, and either + // way, determine the reference name from which we will construct this name + final byte referenceType = tokenStreams.getTokenStream(0, TokenStreams.TOKEN_TYPE).get(); + final ByteBuffer distStream = tokenStreams.getTokenStream(0, referenceType); + final int referenceName = currentNameIndex - distStream.getInt() & 0xFFFFFFFF; + + if (referenceType == TokenStreams.TOKEN_DUP) { + // propagate the existing tokens for the reference name and use them for this name, in case there is + // a future instance of this same name that refers to THIS name's tokens, and then reconstruct and + // return the new name by joining the accumulated tokens + decodedNameTokens.add(currentNameIndex, decodedNameTokens.get(referenceName)); + return String.join("", 
decodedNameTokens.get(currentNameIndex)); + } - // The information about whether a name is a duplicate or not - // is obtained from the list of tokens at tokenStreams[0,0] - final byte nameType = tokenStreams.getTokenStream(0, TokenStreams.TOKEN_TYPE).get(); - final ByteBuffer distBuffer = tokenStreams.getTokenStream(0, nameType); - final int dist = distBuffer.getInt() & 0xFFFFFFFF; - final int prevNameIndex = currentNameIndex - dist; - - if (nameType == TokenStreams.TOKEN_DUP) { - // propagate the tokens for the previous name in case there is a future instance of this same name - // that refers to THIS name's tokens, and then reconstruct and return the name by joining the tokens - // (we don't have index-accessible access the previous name directly here since the previous names are - // stored in a StringJoiner. and also, we only store the index for the most recent instance - // of a name anyway, since we store them in a Map currentNameTokens = new ArrayList<>(30); final StringBuilder decodedNameBuilder = new StringBuilder(); - do { + byte type = -1; + // start at position 1; at position 0, there is only nameType information + for (int tokenPos = 1; type != TokenStreams.TOKEN_END; tokenPos++) { type = tokenStreams.getTokenStream(tokenPos, TokenStreams.TOKEN_TYPE).get(); String currentToken = ""; + switch(type){ case TokenStreams.TOKEN_CHAR: final char currentTokenChar = (char) tokenStreams.getTokenStream(tokenPos, TokenStreams.TOKEN_CHAR).get(); @@ -91,22 +97,23 @@ private String decodeSingleName( currentToken = leftPadWith0(digits0Token, lenDigits0Token); break; case TokenStreams.TOKEN_DELTA: - currentToken = getDeltaToken(tokenStreams, tokenPos, previousEncodedTokens, prevNameIndex, TokenStreams.TOKEN_DELTA); + currentToken = getDeltaToken(tokenPos, tokenStreams, TokenStreams.TOKEN_DELTA, decodedNameTokens, referenceName); break; case TokenStreams.TOKEN_DELTA0: - final String delta0Token = getDeltaToken(tokenStreams, tokenPos, previousEncodedTokens, 
prevNameIndex, TokenStreams.TOKEN_DELTA0); - final int lenDelta0Token = previousEncodedTokens.get(prevNameIndex).get(tokenPos-1).length(); + final String delta0Token = getDeltaToken(tokenPos, tokenStreams, TokenStreams.TOKEN_DELTA0, decodedNameTokens, referenceName); + final int lenDelta0Token = decodedNameTokens.get(referenceName).get(tokenPos-1).length(); currentToken = leftPadWith0(delta0Token, lenDelta0Token); break; case TokenStreams.TOKEN_MATCH: - currentToken = previousEncodedTokens.get(prevNameIndex).get(tokenPos-1); + currentToken = decodedNameTokens.get(referenceName).get(tokenPos-1); break; case TokenStreams.TOKEN_END: // tolerate END, it terminates the enclosing loop break; case TokenStreams.TOKEN_NOP: //no-op token, inserted by the writer to take up space to keep the streams aligned break; - // These are either consumed elsewhere or otherwise shouldn't be present in the stream at this point + + // fall through - these shouldn't be present at this point in the stream, since they are case TokenStreams.TOKEN_TYPE: case TokenStreams.TOKEN_DUP: //position 0 only case TokenStreams.TOKEN_DIFF: //position 0 only @@ -116,43 +123,48 @@ private String decodeSingleName( "Invalid tokenType : %s. 
tokenType must be one of the valid token types", type)); } - //TODO: this is expanding the list many times, which is not efficient - previousEncodedTokens.get(currentNameIndex).add(tokenPos - 1, currentToken); + currentNameTokens.add(tokenPos-1, currentToken); decodedNameBuilder.append(currentToken); - tokenPos++; - } while (type!= TokenStreams.TOKEN_END); + } + decodedNameTokens.add(currentNameIndex, currentNameTokens); return decodedNameBuilder.toString(); } private String getDeltaToken( - final TokenStreams tokenStreams, final int tokenPosition, - final List> tokensList, - final int prevNameIndex, - final byte tokenType) { - if (!(tokenType == TokenStreams.TOKEN_DELTA || tokenType == TokenStreams.TOKEN_DELTA0)){ + final TokenStreams tokenStreams, + final byte tokenType, + final List> previousTokensList, + final int prevNameIndex) { + if (tokenType != TokenStreams.TOKEN_DELTA && tokenType != TokenStreams.TOKEN_DELTA0){ throw new CRAMException(String.format( - "Invalid tokenType : %s. tokenType must be either TOKEN_DELTA or TOKEN_DELTA0", tokenType)); + "Invalid delta tokenType %s must be either TOKEN_DELTA or TOKEN_DELTA0", tokenType)); } - int prevToken; + try { - prevToken = Integer.parseInt(tokensList.get(prevNameIndex).get(tokenPosition - 1)); + final String prevToken = previousTokensList.get(prevNameIndex).get(tokenPosition - 1); + int prevTokenInt = Integer.parseInt(prevToken); + final int deltaTokenValue = tokenStreams.getTokenStream(tokenPosition, tokenType).get() & 0xFF; + return Long.toString(prevTokenInt + deltaTokenValue); } catch (final NumberFormatException e) { - final String exceptionMessageSubstring = (tokenType == TokenStreams.TOKEN_DELTA) ? "DIGITS or DELTA" : "DIGITS0 or DELTA0"; - throw new CRAMException(String.format("The token in the prior name must be of type %s", exceptionMessageSubstring), e); + throw new CRAMException( + String.format("The token in the prior name must be of type %s", + tokenType == TokenStreams.TOKEN_DELTA ? 
+ "DIGITS or DELTA" : + "DIGITS0 or DELTA0", e)); } - final int deltaTokenValue = tokenStreams.getTokenStream(tokenPosition, tokenType).get() & 0xFF; - return Long.toString(prevToken + deltaTokenValue); } private String getDigitsToken( final TokenStreams tokenStreams, final int tokenPosition, final byte tokenType ) { - if (!(tokenType == TokenStreams.TOKEN_DIGITS || tokenType == TokenStreams.TOKEN_DIGITS0)) { - throw new CRAMException(String.format("Invalid tokenType : %s. " + - "tokenType must be either TOKEN_DIGITS or TOKEN_DIGITS0", tokenType)); + if (tokenType != TokenStreams.TOKEN_DIGITS && tokenType != TokenStreams.TOKEN_DIGITS0) { + throw new CRAMException( + String.format( + "Invalid tokenType : %s tokenType must be either TOKEN_DIGITS or TOKEN_DIGITS0", + tokenType)); } final ByteBuffer digitsByteBuffer = tokenStreams.getTokenStream(tokenPosition, tokenType); final long digits = digitsByteBuffer.getInt() & 0xFFFFFFFFL; @@ -165,7 +177,7 @@ private String readString(final ByteBuffer inputBuffer) { final StringBuilder resultStringBuilder = new StringBuilder(); byte currentByte = inputBuffer.get(); while (currentByte != 0) { - //TODO: fix this sketchy cast + //TODO: fix this sketchy cast; this will fail on non-ASCII characters resultStringBuilder.append((char) currentByte); currentByte = inputBuffer.get(); } diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java index c726e4a2bc..d29ff697f2 100644 --- a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java +++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationEncode.java @@ -12,16 +12,20 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.regex.Matcher; import 
java.util.regex.Pattern; +//TODO: implement this from the spec: if a byte stream of token types is entirely MATCH apart from the very +// first value it is discarded. It is possible to regenerate this during decode by observing the other byte streams. +//TODO:its super wasteful (but simpler) to always store the accumulated tokens as Strings, since thisresults +// int lots of String<-> int interconversions + /** - * Naive name tokenization encoder + * Very naive implementation of a name tokenization encoder */ -//TODO: implement this: if a byte stream of token types is entirely MATCH apart from the very first value it is discarded. -// It is possible to regenerate this during decode by observing the other byte streams. public class NameTokenisationEncode { private final static String READ_NAME_TOK_REGEX = "([a-zA-Z0-9]{1,9})|([^a-zA-Z0-9]+)"; private final static Pattern READ_NAME_PATTERN = Pattern.compile(READ_NAME_TOK_REGEX); @@ -32,76 +36,75 @@ public class NameTokenisationEncode { private final static String DIGITS_REGEX = "^[0-9]+$"; private final static Pattern DIGITS_PATTERN = Pattern.compile(DIGITS_REGEX); - private int maxToken; + private int maxNumberOfTokens; // the maximum number of tokenised columns seen across all names private int maxLength; - // the output is a ByteBuffer containing the read names, separated by the NAME_SEPARATOR byte, WITHOUT - // a terminating separator + /** + * Output format is the read names, separated by the NAME_SEPARATOR byte, WITHOUT a terminating separator + */ public ByteBuffer compress(final ByteBuffer inBuffer, final boolean useArith) { - // strictly speaking, this is probably not even necessary, but we have to count the names anyway since its the - // first thing that we need to write to the output stream, so parse the names out while we're at it - final List names = extractInputNames(inBuffer, CRAMEncodingStrategy.DEFAULT_READS_PER_SLICE); - final int numNames = names.size(); - //TODO: is subtracting one correct here ? 
- final int uncompressedInputSize = inBuffer.limit() - numNames - 1; + // strictly speaking, keeping these in a list isn't necessary, but we have to scan the entire input + // anyway to determine the number of names, since that's the first thing that we need to write to the + // output stream, so extract the names while we're scanning + final List namesToEncode = extractInputNames(inBuffer, CRAMEncodingStrategy.DEFAULT_READS_PER_SLICE); + final int numNames = namesToEncode.size(); + final int uncompressedInputSize = inBuffer.limit() - numNames - NameTokenisationDecode.UNCOMPRESSED_LENGTH_ADJUSTMENT; //TODO: guess max size -> str.length*2 + 10000 (from htscodecs javascript code) + // what if this is exceeded ? final ByteBuffer outBuffer = CompressionUtils.allocateByteBuffer((inBuffer.limit()*2)+10000); - //TODO: what is the correct value here ? does/should this include - // the local name delimiter that we use to format the input stream (??) outBuffer.putInt(uncompressedInputSize); outBuffer.putInt(numNames); outBuffer.put((byte)(useArith == true ? 1 : 0)); // keep a List>, as we also need to store the TOKEN_TYPE, relative value when compared // to prev name's token along with the token value. - final List> encodedTokens = new ArrayList<>(numNames); - //TODO: using a map is sketchy here, since read names can be duplicates; this is probably fine since it doesn't - // really matter which index is recorded here - any one will do + final List> tokenAccumulator = new ArrayList<>(numNames); + // note that using a map is a little sketchy here, since read names can be duplicates; but its fine since + // it doesn't really matter which index is recorded here - any one will do final HashMap nameIndexMap = new HashMap<>(); final int[] tokenFrequencies = new int[256]; for(int nameIndex = 0; nameIndex < numNames; nameIndex++) { - //TODO: sketchy that we're passing encodedTokens and also modifying it.... 
- encodedTokens.add(tokeniseName(encodedTokens, nameIndexMap, tokenFrequencies, names.get(nameIndex), nameIndex)); + tokenAccumulator.add( + tokeniseName( + namesToEncode.get(nameIndex), + nameIndex, + tokenAccumulator, + nameIndexMap, + tokenFrequencies) + ); } - for (int tokenPosition = 0; tokenPosition < maxToken; tokenPosition++) { - final List tokenStream = new ArrayList(TokenStreams.TOTAL_TOKEN_TYPES); - for (int i = 0; i < TokenStreams.TOTAL_TOKEN_TYPES; i++) { - //TODO: this is overkill - creating giant buffers, the size of which surfaces in the decoder - tokenStream.add(CompressionUtils.allocateByteBuffer(numNames * maxLength)); - } - fillByteStreams(tokenStream, encodedTokens, tokenPosition, numNames); - serializeByteStreams(tokenStream, useArith, outBuffer); + for (int tokenPosition = 0; tokenPosition < maxNumberOfTokens; tokenPosition++) { + final List tokenStreamForPosition = fillByteStreamsForPosition(tokenPosition, numNames, tokenAccumulator); + serializeByteStreamsFor(tokenStreamForPosition, useArith, outBuffer); } // sets limit to current position and position to '0' - outBuffer.flip(); + //outBuffer.flip(); + outBuffer.limit(outBuffer.position()); + outBuffer.position(0); return outBuffer; } // return the token list for a new name private List tokeniseName( - final List> previousEncodedTokens, - final HashMap nameIndexMap, - final int[] tokenFrequencies, final String name, - final int currentNameIndex) { + final int nameIndex, + final List> tokenAccumulator, + final HashMap nameIndexMap, + final int[] tokenFrequencies) { // create a new token list for this name, and populate position 0 with the token that indicates whether // the name is a DIFF or DUP final List encodedTokens = new ArrayList<>(); if (nameIndexMap.containsKey(name)) { - //TODO:its super wasteful to do all this string interconversion - final String indexStr = String.valueOf(currentNameIndex - nameIndexMap.get(name)); + final String indexStr = String.valueOf(nameIndex - 
nameIndexMap.get(name)); encodedTokens.add(0, new EncodeToken(indexStr, indexStr, TokenStreams.TOKEN_DUP)); - // this is a duplicate, so just return the tokens from the previous name, or just return - // altogether ? YES! - //???????????????????????????????????? return encodedTokens; - } else { - final String indexStr = String.valueOf(currentNameIndex == 0 ? 0 : 1); - encodedTokens.add(0,new EncodeToken(indexStr, indexStr, TokenStreams.TOKEN_DIFF)); } - nameIndexMap.put(name, currentNameIndex); + //TODO: why does this list need a 0 or 1 to tell if this is the first token ? + final String indexStr = String.valueOf(nameIndex == 0 ? 0 : 1); + encodedTokens.add(0,new EncodeToken(indexStr, indexStr, TokenStreams.TOKEN_DIFF)); + nameIndexMap.put(name, nameIndex); // tokenise the current name final Matcher matcher = READ_NAME_PATTERN.matcher(name); @@ -130,9 +133,9 @@ private List tokeniseName( // compare the current token with token from the previous name at the current token's index // if there exists a previous name and a token at the corresponding index of the previous name - final int prevNameIndex = currentNameIndex - 1; // always compare against last name only - if (prevNameIndex >=0 && previousEncodedTokens.get(prevNameIndex).size() > tokenIndex) { - final EncodeToken prevToken = previousEncodedTokens.get(prevNameIndex).get(tokenIndex); + final int prevNameIndex = nameIndex - 1; // naive implementation where wer always compare against last name only + if (prevNameIndex >=0 && tokenAccumulator.get(prevNameIndex).size() > tokenIndex) { + final EncodeToken prevToken = tokenAccumulator.get(prevNameIndex).get(tokenIndex); if (prevToken.getActualTokenValue().equals(tok.get(i))) { type = TokenStreams.TOKEN_MATCH; val = ""; @@ -142,7 +145,7 @@ private List tokeniseName( int s = Integer.parseInt(prevToken.getActualTokenValue()); int d = v - s; tokenFrequencies[tokenIndex]++; - if (d >= 0 && d < 256 && tokenFrequencies[tokenIndex] > currentNameIndex / 2) { + if (d >= 0 && d 
< 256 && tokenFrequencies[tokenIndex] > nameIndex / 2) { type = TokenStreams.TOKEN_DELTA; val = String.valueOf(d); } @@ -150,7 +153,7 @@ private List tokeniseName( && (prevToken.getTokenType() == TokenStreams.TOKEN_DIGITS0 || prevToken.getTokenType() == TokenStreams.TOKEN_DELTA0)) { int d = Integer.parseInt(val) - Integer.parseInt(prevToken.getActualTokenValue()); tokenFrequencies[tokenIndex]++; - if (d >= 0 && d < 256 && tokenFrequencies[tokenIndex] > currentNameIndex / 2) { + if (d >= 0 && d < 256 && tokenFrequencies[tokenIndex] > nameIndex / 2) { type = TokenStreams.TOKEN_DELTA0; val = String.valueOf(d); } @@ -169,8 +172,8 @@ private List tokeniseName( encodedTokens.add(new EncodeToken("","",TokenStreams.TOKEN_END)); final int currMaxToken = encodedTokens.size(); - if (maxToken < currMaxToken) { - maxToken = currMaxToken; + if (maxNumberOfTokens < currMaxToken) { + maxNumberOfTokens = currMaxToken; } if (maxLength < currMaxLength) { maxLength = currMaxLength; @@ -179,7 +182,7 @@ private List tokeniseName( return encodedTokens; } - private List extractInputNames(final ByteBuffer inBuffer, final int preAllocationSize) { + private static List extractInputNames(final ByteBuffer inBuffer, final int preAllocationSize) { final List names = new ArrayList(CRAMEncodingStrategy.DEFAULT_READS_PER_SLICE); // extract the individual names from the input buffer for (int lastPosition = inBuffer.position(); inBuffer.hasRemaining();) { @@ -192,7 +195,7 @@ private List extractInputNames(final ByteBuffer inBuffer, final int preA names.add(new String( bytes, 0, - // special case handling end of the buffer, where there is no for the lack of a trailing separator + //TODO: special case handling end of the buffer, where there is no for the lack of a trailing separator length - (inBuffer.position() == inBuffer.limit() ? 
0 : 1), StandardCharsets.UTF_8)); lastPosition = inBuffer.position(); @@ -201,55 +204,72 @@ private List extractInputNames(final ByteBuffer inBuffer, final int preA return names; } - private void fillByteStreams( - final List tokenStream, - final List> tokensList, + // NOTE: the calling code relies on the fact that on return, the position of each ByteBuffer corresponds to the + // end of the corresponding stream + private List fillByteStreamsForPosition( final int tokenPosition, - final int numNames) { - - // Fill tokenStreams object using tokensList - for (int nameIndex = 0; nameIndex < numNames; nameIndex++) { - if (tokenPosition > 0 && tokensList.get(nameIndex).get(0).getTokenType() == TokenStreams.TOKEN_DUP) { + final int numNames, + final List> tokenAccumulator) { + // create the outer list, but don't allocate bytestreams until we actually need them, since this list is + // often quite sparse + final List tokenStreams = Arrays.asList(new ByteBuffer[TokenStreams.TOTAL_TOKEN_TYPES]); + // for the token type stream + getOrCreateByteBufferFor(tokenStreams, TokenStreams.TOKEN_TYPE, numNames); // for the one allocation we'll need for sure + + for (int n = 0; n < numNames; n++) { + final List encodedTokensForName = tokenAccumulator.get(n); + if (tokenPosition > 0 && encodedTokensForName.get(0).getTokenType() == TokenStreams.TOKEN_DUP) { continue; } - if (tokensList.get(nameIndex).size() <= tokenPosition) { + if (encodedTokensForName.size() <= tokenPosition) { continue; } - final EncodeToken encodeToken = tokensList.get(nameIndex).get(tokenPosition); + final EncodeToken encodeToken = encodedTokensForName.get(tokenPosition); final byte type = encodeToken.getTokenType(); - tokenStream.get(TokenStreams.TOKEN_TYPE).put(type); + tokenStreams.get(TokenStreams.TOKEN_TYPE).put(type); switch (type) { case TokenStreams.TOKEN_DIFF: - tokenStream.get(TokenStreams.TOKEN_DIFF).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); + getOrCreateByteBufferFor(tokenStreams, 
TokenStreams.TOKEN_DIFF, numNames) + .putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); break; case TokenStreams.TOKEN_DUP: - tokenStream.get(TokenStreams.TOKEN_DUP).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); + getOrCreateByteBufferFor(tokenStreams, TokenStreams.TOKEN_DUP, numNames) + .putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); break; case TokenStreams.TOKEN_STRING: - writeString(tokenStream.get(TokenStreams.TOKEN_STRING),encodeToken.getRelativeTokenValue()); + writeString( + getOrCreateByteBufferFor(tokenStreams, TokenStreams.TOKEN_STRING, numNames), + encodeToken.getRelativeTokenValue() + ); break; case TokenStreams.TOKEN_CHAR: - tokenStream.get(TokenStreams.TOKEN_CHAR).put(encodeToken.getRelativeTokenValue().getBytes()[0]); + getOrCreateByteBufferFor(tokenStreams, TokenStreams.TOKEN_CHAR, numNames) + .put((byte) encodeToken.getRelativeTokenValue().charAt(0)); break; case TokenStreams.TOKEN_DIGITS: - tokenStream.get(TokenStreams.TOKEN_DIGITS).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); + getOrCreateByteBufferFor(tokenStreams, TokenStreams.TOKEN_DIGITS, numNames) + .putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); break; case TokenStreams.TOKEN_DIGITS0: - tokenStream.get(TokenStreams.TOKEN_DIGITS0).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); - tokenStream.get(TokenStreams.TOKEN_DZLEN).put((byte) encodeToken.getRelativeTokenValue().length()); + getOrCreateByteBufferFor(tokenStreams, TokenStreams.TOKEN_DIGITS0, numNames) + .putInt(Integer.parseInt(encodeToken.getRelativeTokenValue())); + getOrCreateByteBufferFor(tokenStreams, TokenStreams.TOKEN_DZLEN, numNames) + .put((byte) encodeToken.getRelativeTokenValue().length()); break; case TokenStreams.TOKEN_DELTA: - tokenStream.get(TokenStreams.TOKEN_DELTA).put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue())); + getOrCreateByteBufferFor(tokenStreams, TokenStreams.TOKEN_DELTA, numNames) + 
.put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue())); break; case TokenStreams.TOKEN_DELTA0: - tokenStream.get(TokenStreams.TOKEN_DELTA0).put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue())); + getOrCreateByteBufferFor(tokenStreams, TokenStreams.TOKEN_DELTA0, numNames) + .put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue())); break; case TokenStreams.TOKEN_NOP: @@ -263,6 +283,25 @@ private void fillByteStreams( throw new CRAMException("Invalid token type: " + type); } } + // we need to set the limit on these streams so that the downstream consumer does't try to consume + // the entire stream + for (int i = 0; i < TokenStreams.TOTAL_TOKEN_TYPES; i++) { + if (tokenStreams.get(i) != null) { + tokenStreams.get(i).limit(tokenStreams.get(i).position()); + } + } + + return tokenStreams; + } + + private ByteBuffer getOrCreateByteBufferFor( + final List tokenStreams, + final int tokenType, + final int numNames) { + if (tokenStreams.get(tokenType) == null) { + tokenStreams.set(tokenType, CompressionUtils.allocateByteBuffer(maxLength * numNames)); + } + return tokenStreams.get(tokenType); } private static void writeString(final ByteBuffer tokenStreamBuffer, final String val) { @@ -299,8 +338,8 @@ private static ByteBuffer tryCompress(final ByteBuffer nameTokenStream, final bo nameTokenStream.rewind(); final ByteBuffer tmpByteBuffer = rangeEncode.compress(nameTokenStream, new RangeParams(rangeEncoderFlagSet)); //TODO: using "remaining" is a pretty sketchy way to do this, since it assumes that the buffer's limit is meaningful - if (bestCompressedLength > tmpByteBuffer.remaining()) { - bestCompressedLength = tmpByteBuffer.remaining(); + if (bestCompressedLength > tmpByteBuffer.limit()) { + bestCompressedLength = tmpByteBuffer.limit(); compressedByteBuffer = tmpByteBuffer; } } @@ -326,8 +365,8 @@ private static ByteBuffer tryCompress(final ByteBuffer nameTokenStream, final bo final RANSNx16Encode ransEncode = new RANSNx16Encode(); 
nameTokenStream.rewind(); final ByteBuffer tmpByteBuffer = ransEncode.compress(nameTokenStream, new RANSNx16Params(ransNx16FlagSet)); - if (bestCompressedLength > tmpByteBuffer.remaining()) { - bestCompressedLength = tmpByteBuffer.remaining(); + if (bestCompressedLength > tmpByteBuffer.limit()) { + bestCompressedLength = tmpByteBuffer.limit(); compressedByteBuffer = tmpByteBuffer; } } @@ -335,18 +374,18 @@ private static ByteBuffer tryCompress(final ByteBuffer nameTokenStream, final bo return compressedByteBuffer; } - private void serializeByteStreams( - final List tokenStream, + private void serializeByteStreamsFor( + final List tokenStreams, final boolean useArith, final ByteBuffer outBuffer) { - - // Compress and serialise tokenStreams + // Compress and serialise the non-null tokenStreams for (int tokenType = 0; tokenType <= TokenStreams.TOKEN_END; tokenType++) { - if (tokenStream.get(tokenType).remaining() > 0) { + final ByteBuffer tokenBytes = tokenStreams.get(tokenType); + if (tokenBytes != null && tokenBytes.position() > 0) { outBuffer.put((byte) (tokenType + ((tokenType == 0) ? 128 : 0))); //TODO: check this for sign bit correctness - final ByteBuffer tempOutByteBuffer = tryCompress(tokenStream.get(tokenType), useArith); + final ByteBuffer tempOutByteBuffer = tryCompress(tokenBytes, useArith); //TODO: the use of limit is sketchy here... 
- CompressionUtils.writeUint7(tempOutByteBuffer.limit(),outBuffer); + CompressionUtils.writeUint7(tempOutByteBuffer.limit(), outBuffer); outBuffer.put(tempOutByteBuffer); } } diff --git a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java index d00d91dcd3..8a3e1789a8 100644 --- a/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java +++ b/src/main/java/htsjdk/samtools/cram/compression/nametokenisation/TokenStreams.java @@ -30,9 +30,8 @@ public class TokenStreams { private static final int TYPE_TOKEN_FLAG_MASK = 0x3F; // choose an initial estimate of the number of expected token positions, which we use to preallocate lists - //TODO: force a test of this (again) by setting it to a small number - private static final int DEFAULT_NUMBER_OF_TOKEN_POSITIONS = 4; - private static int POSITION_INCREMENT = 2; // expand allocation by 5 every time we exceed the initial estimate + private static final int DEFAULT_NUMBER_OF_TOKEN_POSITIONS = 30; + private static int POSITION_INCREMENT = 4; // expand allocation by 5 every time we exceed the initial estimate // called 'B' in the spec, indexed (conceptually) as tokenStreams(tokenPos, tokenType) private ByteBuffer[][] tokenStreams; @@ -104,60 +103,60 @@ public TokenStreams(final ByteBuffer inputByteBuffer, final int useArith, final getTokenStreamsForPos(tokenPosition)[tokenType] = uncompressedTokenStream; } } - displayTokenStreamSizes(); + //displayTokenStreamSizes(); } - private void displayTokenStreamSizes() { - for (int t = 0; t < TOTAL_TOKEN_TYPES; t++) { - System.out.print(String.format("%s ", typeToString(t))); - } - System.out.println(); - for (int pos = 0; pos < tokenStreams.length; pos++) { - System.out.print(String.format("Pos %2d: ", pos)); - for (int typ = 0; typ < tokenStreams[pos].length; typ++) { - final ByteBuffer bf = tokenStreams[pos][typ]; - if (bf == null) { - 
System.out.print(String.format("%8s", "null")); - } else { - System.out.print(String.format("%8d", bf.limit())); - } - } - System.out.println(); - } - } - - private String typeToString(int i) { - switch (i) { - case TOKEN_TYPE: - return "TOKEN_TYPE"; - case TOKEN_STRING: - return "TOKEN_STRING"; - case TOKEN_CHAR: - return "TOKEN_CHAR"; - case TOKEN_DIGITS0: - return "TOKEN_DIGITS0"; - case TOKEN_DZLEN: - return "TOKEN_DZLEN"; - case TOKEN_DUP: - return "TOKEN_DUP"; - case TOKEN_DIFF: - return "TOKEN_DIFF"; - case TOKEN_DIGITS: - return "TOKEN_DIGITS"; - case TOKEN_DELTA: - return "TOKEN_DELTA"; - case TOKEN_DELTA0: - return "TOKEN_DELTA0"; - case TOKEN_MATCH: - return "TOKEN_MATCH"; - case TOKEN_END: - return "TOKEN_END"; - case TOKEN_NOP: - return "NOP"; - default: - throw new CRAMException("Invalid name tokenizer tokenType: " + i); - } - } +// private void displayTokenStreamSizes() { +// for (int t = 0; t < TOTAL_TOKEN_TYPES; t++) { +// System.out.print(String.format("%s ", typeToString(t))); +// } +// System.out.println(); +// for (int pos = 0; pos < tokenStreams.length; pos++) { +// System.out.print(String.format("Pos %2d: ", pos)); +// for (int typ = 0; typ < tokenStreams[pos].length; typ++) { +// final ByteBuffer bf = tokenStreams[pos][typ]; +// if (bf == null) { +// System.out.print(String.format("%8s", "null")); +// } else { +// System.out.print(String.format("%8d", bf.limit())); +// } +// } +// System.out.println(); +// } +// } +// +// private String typeToString(int i) { +// switch (i) { +// case TOKEN_TYPE: +// return "TOKEN_TYPE"; +// case TOKEN_STRING: +// return "TOKEN_STRING"; +// case TOKEN_CHAR: +// return "TOKEN_CHAR"; +// case TOKEN_DIGITS0: +// return "TOKEN_DIGITS0"; +// case TOKEN_DZLEN: +// return "TOKEN_DZLEN"; +// case TOKEN_DUP: +// return "TOKEN_DUP"; +// case TOKEN_DIFF: +// return "TOKEN_DIFF"; +// case TOKEN_DIGITS: +// return "TOKEN_DIGITS"; +// case TOKEN_DELTA: +// return "TOKEN_DELTA"; +// case TOKEN_DELTA0: +// return 
"TOKEN_DELTA0"; +// case TOKEN_MATCH: +// return "TOKEN_MATCH"; +// case TOKEN_END: +// return "TOKEN_END"; +// case TOKEN_NOP: +// return "NOP"; +// default: +// throw new CRAMException("Invalid name tokenizer tokenType: " + i); +// } +// } public ByteBuffer[] getTokenStreamsForPos(final int pos) { return tokenStreams[pos]; diff --git a/src/main/java/htsjdk/samtools/cram/compression/range/RangeDecode.java b/src/main/java/htsjdk/samtools/cram/compression/range/RangeDecode.java index 379891e774..a00af4ab94 100644 --- a/src/main/java/htsjdk/samtools/cram/compression/range/RangeDecode.java +++ b/src/main/java/htsjdk/samtools/cram/compression/range/RangeDecode.java @@ -17,13 +17,12 @@ public class RangeDecode { // It uncompresses the data in the inBuffer, leaving it consumed. // Returns a rewound ByteBuffer containing the uncompressed data. public ByteBuffer uncompress(final ByteBuffer inBuffer) { - - // For Range decoding, the bytes are read in little endian from the input stream - inBuffer.order(ByteOrder.LITTLE_ENDIAN); return uncompress(inBuffer, 0); } private ByteBuffer uncompress(final ByteBuffer inBuffer, final int outSize) { + // For Range decoding, the bytes are read in little endian from the input stream + inBuffer.order(ByteOrder.LITTLE_ENDIAN); if (inBuffer.remaining() == 0) { return EMPTY_BUFFER; } diff --git a/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java b/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java index 86af2eb10b..c81a7a6d3a 100644 --- a/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java +++ b/src/test/java/htsjdk/samtools/cram/NameTokenizationInteropTest.java @@ -16,8 +16,8 @@ import java.util.ArrayList; import java.util.List; -//TODO: this is dumb because we're running the same decode-only test twice, once with useArith true and once with it false, -// which is ignored the the decode-only test anyway; we should separate the data provides to eliminate this +//TODO: this is dumb because 
 we're running the same decode-only test twice, once with useArith true and once with + it false, which is ignored by the decode-only test anyway; we should separate the data providers to eliminate this // Test the roundtrip and decompression of name tokenization encoded data using the htslib cram interop stream // data for the name tokenization codec. diff --git a/src/test/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationTest.java b/src/test/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationTest.java index 1743e860d2..f086d8c298 100644 --- a/src/test/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationTest.java +++ b/src/test/java/htsjdk/samtools/cram/compression/nametokenisation/NameTokenisationTest.java @@ -25,54 +25,53 @@ public String toString() { @DataProvider(name="nameTokenisation") public Object[][] getNameTokenisationTestData() { - List readNamesList = new ArrayList<>(); + final List readNamesList = new ArrayList<>(); readNamesList.add(""); // a subset of read names from // src/test/resources/htsjdk/samtools/cram/CEUTrio.HiSeq.WGS.b37.NA12878.20.first.8000.bam - //TODO:fix these to use LOCAL_NAME_SEPARATOR - readNamesList.add("20FUKAAXX100202:6:27:4968:125377\0" + - "20FUKAAXX100202:6:27:4986:125375\0" + - "20FUKAAXX100202:5:62:8987:1929\0" + - "20GAVAAXX100126:1:28:4295:139802\0" + - "20FUKAAXX100202:4:23:8516:117251\0" + - "20FUKAAXX100202:6:23:6442:37469\0" + - "20FUKAAXX100202:8:24:10477:24196\0" + - "20GAVAAXX100126:8:63:5797:158250\0" + - "20FUKAAXX100202:1:45:12798:104365\0" + - "20GAVAAXX100126:3:23:6419:199245\0" + - "20FUKAAXX100202:8:48:6663:137967\0" + - "20FUKAAXX100202:6:68:17726:162601"); + readNamesList.add("20FUKAAXX100202:6:27:4968:125377" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "20FUKAAXX100202:6:27:4986:125375" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "20FUKAAXX100202:5:62:8987:1929" + 
NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "20GAVAAXX100126:1:28:4295:139802" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "20FUKAAXX100202:4:23:8516:117251" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "20FUKAAXX100202:6:23:6442:37469" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "20FUKAAXX100202:8:24:10477:24196" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "20GAVAAXX100126:8:63:5797:158250" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "20FUKAAXX100202:1:45:12798:104365" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "20GAVAAXX100126:3:23:6419:199245" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "20FUKAAXX100202:8:48:6663:137967" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "20FUKAAXX100202:6:68:17726:162601"); //TODO: no trailing separator...for now // a subset of read names from // src/test/resources/htsjdk/samtools/longreads/NA12878.m64020_190210_035026.chr21.5011316.5411316.unmapped.bam - readNamesList.add("m64020_190210_035026/44368402/ccs\0"); - readNamesList.add("m64020_190210_035026/44368402/ccs"); - readNamesList.add("m64020_190210_035026/44368402/ccs\0" + - "m64020_190210_035026/124127126/ccs\0" + - "m64020_190210_035026/4981311/ccs\0" + - "m64020_190210_035026/80022195/ccs\0" + - "m64020_190210_035026/17762104/ccs\0" + - "m64020_190210_035026/62981096/ccs\0" + - "m64020_190210_035026/86968803/ccs\0" + - "m64020_190210_035026/46400955/ccs\0" + - "m64020_190210_035026/137561592/ccs\0" + - "m64020_190210_035026/52233471/ccs\0" + - "m64020_190210_035026/97127189/ccs\0" + - "m64020_190210_035026/115278035/ccs\0" + - "m64020_190210_035026/155256324/ccs\0" + - "m64020_190210_035026/163644151/ccs\0" + - "m64020_190210_035026/162728365/ccs\0" + - "m64020_190210_035026/160238116/ccs\0" + - "m64020_190210_035026/147719983/ccs\0" + - "m64020_190210_035026/60883331/ccs\0" + - 
"m64020_190210_035026/1116165/ccs\0" + - "m64020_190210_035026/75893199/ccs"); + readNamesList.add("m64020_190210_035026/44368402/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE); + readNamesList.add("m64020_190210_035026/44368402/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE); + readNamesList.add("m64020_190210_035026/44368402/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/124127126/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/4981311/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/80022195/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/17762104/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/62981096/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/86968803/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/46400955/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/137561592/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/52233471/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/97127189/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/115278035/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/155256324/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/163644151/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/162728365/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/160238116/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/147719983/ccs" + 
NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/60883331/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/1116165/ccs" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "m64020_190210_035026/75893199/ccs"); //TODO:no trailing separator for now // source: https://gatk.broadinstitute.org/hc/en-us/articles/360035890671-Read-groups readNamesList.add( - "H0164ALXX140820:2:1101:10003:23460\0" + - "H0164ALXX140820:2:1101:15118:25288"); + "H0164ALXX140820:2:1101:10003:23460" + NameTokenisationDecode.LOCAL_NAME_SEPARATOR_CHARSEQUENCE + + "H0164ALXX140820:2:1101:15118:25288"); //TODO: no trailing separator...for now final List testCases = new ArrayList<>(); for (String readName : readNamesList) {