Skip to content

Commit

Permalink
Consolidate and optimize, remove unnecessary code.
Browse files Browse the repository at this point in the history
  • Loading branch information
cmnbroad committed Dec 17, 2024
1 parent ed26bb6 commit d93218d
Show file tree
Hide file tree
Showing 5 changed files with 291 additions and 239 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,18 @@
// so we don't have to repeatedly interconvert them when fetching from this list

public class NameTokenisationDecode {
// TODO: lift these values to a common location since they're used by the encoder, the decoder, and the tests
//TODO: lift these values to a common location since they're used by the encoder, the decoder, and the tests
//TODO: once we get clarity on the spec (https://github.com/samtools/hts-specs/issues/802) on how to
// calculate this, we should use it to verify the result of decompressing
// for now, since we're returning a String of all the names (instead of a list, which would be more efficient),
// use a single byte to separate the names; this particular byte is chosen because the calling code in the CRAM
// reader for read names already assumes it will be handed a block of '\0' separated names
public final static byte NAME_SEPARATOR = 0;
public final static CharSequence LOCAL_NAME_SEPARATOR_CHARSEQUENCE = new String(new byte[] {NAME_SEPARATOR});

//TODO: once we get clarity on the spec (https://github.com/samtools/hts-specs/issues/802) on how to
// calculate this, we should use it to verify the result of decompressing
public static final int UNCOMPRESSED_LENGTH_ADJUSTMENT = 1;

public static int DEFAULT_POSITION_ALLOCATION = 30;

// the input must be a ByteBuffer containing the read names, separated by the NAME_SEPARATOR byte, WITHOUT
// a terminating separator
public String uncompress(final ByteBuffer inBuffer) {
Expand Down Expand Up @@ -54,60 +55,59 @@ private String decodeSingleName(
final int currentNameIndex) {
// consult tokenStreams[0, TokenStreams.TOKEN_TYPE] to determine if this name uses dup or diff, and either
// way, determine the reference name from which we will construct this name
final byte referenceType = tokenStreams.getTokenStream(0, TokenStreams.TOKEN_TYPE).get();
final ByteBuffer distStream = tokenStreams.getTokenStream(0, referenceType);
final byte referenceType = tokenStreams.getStream(0, TokenStreams.TOKEN_TYPE).get();
final ByteBuffer distStream = tokenStreams.getStream(0, referenceType);
final int referenceName = currentNameIndex - distStream.getInt() & 0xFFFFFFFF;

if (referenceType == TokenStreams.TOKEN_DUP) {
// propagate the existing tokens for the reference name and use them for this name, in case there is
// a future instance of this same name that refers to THIS name's tokens, and then reconstruct and
// propagate the existing tokens for the reference name and use them for this name (in case there is
// a future instance of this same name that refers to THIS name's tokens), and then reconstruct and
// return the new name by joining the accumulated tokens
decodedNameTokens.add(currentNameIndex, decodedNameTokens.get(referenceName));
return String.join("", decodedNameTokens.get(currentNameIndex));
}

if (referenceType != TokenStreams.TOKEN_DIFF) {
} else if (referenceType != TokenStreams.TOKEN_DIFF) {
throw new CRAMException(String.format(
"Invalid nameType %s. nameType must be either TOKEN_DIFF or TOKEN_DUP", referenceType));
}

// preallocate for 30 tokens, but let the list auto-expand if we exceed that
final List<String> currentNameTokens = new ArrayList<>(30);
// preallocate for DEFAULT_POSITION_ALLOCATION tokens (columns), but the list will auto-expand if we exceed that
final List<String> currentNameTokens = new ArrayList<>(DEFAULT_POSITION_ALLOCATION);
final StringBuilder decodedNameBuilder = new StringBuilder();
byte type = -1;

// start at position 1; at position 0, there is only nameType information
for (int tokenPos = 1; type != TokenStreams.TOKEN_END; tokenPos++) {
type = tokenStreams.getTokenStream(tokenPos, TokenStreams.TOKEN_TYPE).get();
type = tokenStreams.getStream(tokenPos, TokenStreams.TOKEN_TYPE).get();
String currentToken = "";

switch(type){
switch (type) {
case TokenStreams.TOKEN_CHAR:
final char currentTokenChar = (char) tokenStreams.getTokenStream(tokenPos, TokenStreams.TOKEN_CHAR).get();
final char currentTokenChar = (char) tokenStreams.getStream(tokenPos, TokenStreams.TOKEN_CHAR).get();
currentToken = String.valueOf(currentTokenChar);
break;
case TokenStreams.TOKEN_STRING:
currentToken = readString(tokenStreams.getTokenStream(tokenPos, TokenStreams.TOKEN_STRING));
currentToken = readString(tokenStreams.getStream(tokenPos, TokenStreams.TOKEN_STRING));
break;
case TokenStreams.TOKEN_DIGITS:
currentToken = getDigitsToken(tokenStreams, tokenPos, TokenStreams.TOKEN_DIGITS);
break;
case TokenStreams.TOKEN_DIGITS0:
final String digits0Token = getDigitsToken(tokenStreams, tokenPos, TokenStreams.TOKEN_DIGITS0);
final int lenDigits0Token = tokenStreams.getTokenStream(tokenPos, TokenStreams.TOKEN_DZLEN).get() & 0xFF;
final int lenDigits0Token = tokenStreams.getStream(tokenPos, TokenStreams.TOKEN_DZLEN).get() & 0xFF;
currentToken = leftPadWith0(digits0Token, lenDigits0Token);
break;
case TokenStreams.TOKEN_DELTA:
currentToken = getDeltaToken(tokenPos, tokenStreams, TokenStreams.TOKEN_DELTA, decodedNameTokens, referenceName);
currentToken = getDeltaToken(tokenStreams, tokenPos, TokenStreams.TOKEN_DELTA, decodedNameTokens, referenceName);
break;
case TokenStreams.TOKEN_DELTA0:
final String delta0Token = getDeltaToken(tokenPos, tokenStreams, TokenStreams.TOKEN_DELTA0, decodedNameTokens, referenceName);
final String delta0Token = getDeltaToken(tokenStreams, tokenPos, TokenStreams.TOKEN_DELTA0, decodedNameTokens, referenceName);
final int lenDelta0Token = decodedNameTokens.get(referenceName).get(tokenPos-1).length();
currentToken = leftPadWith0(delta0Token, lenDelta0Token);
break;
case TokenStreams.TOKEN_MATCH:
currentToken = decodedNameTokens.get(referenceName).get(tokenPos-1);
break;
case TokenStreams.TOKEN_END: // tolerate END, it terminates the enclosing loop
case TokenStreams.TOKEN_END: // tolerate END, it terminates the enclosing loop
break;
case TokenStreams.TOKEN_NOP:
//no-op token, inserted by the writer to take up space to keep the streams aligned
Expand All @@ -132,8 +132,8 @@ private String decodeSingleName(
}

private String getDeltaToken(
final int tokenPosition,
final TokenStreams tokenStreams,
final int tokenPosition,
final byte tokenType,
final List<List<String>> previousTokensList,
final int prevNameIndex) {
Expand All @@ -145,7 +145,7 @@ private String getDeltaToken(
try {
final String prevToken = previousTokensList.get(prevNameIndex).get(tokenPosition - 1);
int prevTokenInt = Integer.parseInt(prevToken);
final int deltaTokenValue = tokenStreams.getTokenStream(tokenPosition, tokenType).get() & 0xFF;
final int deltaTokenValue = tokenStreams.getStream(tokenPosition, tokenType).get() & 0xFF;
return Long.toString(prevTokenInt + deltaTokenValue);
} catch (final NumberFormatException e) {
throw new CRAMException(
Expand All @@ -163,10 +163,10 @@ private String getDigitsToken(
if (tokenType != TokenStreams.TOKEN_DIGITS && tokenType != TokenStreams.TOKEN_DIGITS0) {
throw new CRAMException(
String.format(
"Invalid tokenType : %s tokenType must be either TOKEN_DIGITS or TOKEN_DIGITS0",
"Invalid tokenType: %s tokenType must be either TOKEN_DIGITS or TOKEN_DIGITS0",
tokenType));
}
final ByteBuffer digitsByteBuffer = tokenStreams.getTokenStream(tokenPosition, tokenType);
final ByteBuffer digitsByteBuffer = tokenStreams.getStream(tokenPosition, tokenType);
final long digits = digitsByteBuffer.getInt() & 0xFFFFFFFFL;
return Long.toString(digits);
}
Expand All @@ -177,7 +177,7 @@ private String readString(final ByteBuffer inputBuffer) {
final StringBuilder resultStringBuilder = new StringBuilder();
byte currentByte = inputBuffer.get();
while (currentByte != 0) {
//TODO: fix this sketchy cast; this will fail on non-ASCII characters
//TODO: fix this sketchy cast; this will fail on non-ASCII characters ?
resultStringBuilder.append((char) currentByte);
currentByte = inputBuffer.get();
}
Expand Down
Loading

0 comments on commit d93218d

Please sign in to comment.