Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementation of CRAM 3.1 codecs. #1714

Open
wants to merge 31 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
306459c
Implementation of CRAM 3.1 codecs.
Jan 14, 2022
dd9b17e
Many FQZComp fixes, with roundtrip tests working.
cmnbroad Nov 13, 2024
9534f3d
Name tokenization codec fixes.
cmnbroad Nov 13, 2024
36d8622
Interop test data from htslib.
cmnbroad Nov 13, 2024
0f7c991
Remove unnecessary mac files.
cmnbroad Nov 25, 2024
ce7430f
Use shared rans decoder in interop tests.
cmnbroad Nov 25, 2024
ca64055
Update useArith type in name tokenizer, remove unecessary object crea…
cmnbroad Nov 26, 2024
56b8322
Remove sketchy exception suppression, make test encoding params type-…
cmnbroad Nov 26, 2024
d354ab2
Code cleanup.
cmnbroad Dec 5, 2024
f7a4ee6
Store token stream in arrays instead of lists.
cmnbroad Dec 5, 2024
ea0b8b2
More naming, removal of unnecessary code, switch sanitization.
cmnbroad Dec 9, 2024
8559902
Temp update.
cmnbroad Dec 9, 2024
561595d
Precompile regular expression patterns, optimize some string operatio…
cmnbroad Dec 10, 2024
7777e24
Checkpoint 1.
cmnbroad Dec 12, 2024
fdc2153
Checkpoint 2.
cmnbroad Dec 12, 2024
d7f2095
Repair haphazard stream management.
cmnbroad Dec 15, 2024
d39ddc0
Consolidate and optimize, remove unecessary code.
cmnbroad Dec 16, 2024
70665d1
Fix spotbugs issue.
cmnbroad Dec 17, 2024
22e0dfa
Remove unnecessary code.
cmnbroad Jan 6, 2025
2f3ce74
Fix sketchy byte conversion to use UTF-8 charset for names.
cmnbroad Jan 6, 2025
a8a7e55
Remove obsolete comment.
cmnbroad Jan 6, 2025
a71f9a9
Upgrade to samtools 1.21.
cmnbroad Jan 6, 2025
bdf9338
Remove redundant/duplicate tests.
cmnbroad Jan 6, 2025
bad9ece
Eliminate intermediate String representation for decoded names.
cmnbroad Jan 7, 2025
91c1cce
Standardize input/output read name buffer separator.
cmnbroad Jan 7, 2025
2b4ffd8
Update name separator handling.
cmnbroad Jan 7, 2025
b543c57
Comment update.
cmnbroad Jan 7, 2025
7a50f40
Remove old interop data.
cmnbroad Jan 14, 2025
71e7145
Add updated interop test files.
cmnbroad Jan 14, 2025
a37b995
Remove .DS_Store files.
cmnbroad Jan 14, 2025
282322a
Conform to updated interop test structure.
cmnbroad Jan 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ htsjdk.iws
atlassian-ide-plugin.xml
/htsjdk.version.properties
/test-output/
.DS_Store

#intellij
.idea/
Expand Down
2 changes: 1 addition & 1 deletion scripts/install-samtools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ sudo apt-get upgrade
sudo apt-get install -y libncurses-dev libbz2-dev liblzma-dev

#install from the github tar
export SAMTOOLS_VERSION=1.19.1
export SAMTOOLS_VERSION=1.21
wget https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2
tar -xjvf samtools-${SAMTOOLS_VERSION}.tar.bz2
cd samtools-${SAMTOOLS_VERSION} && ./configure --prefix=/usr && make && sudo make install
178 changes: 178 additions & 0 deletions src/main/java/htsjdk/samtools/cram/compression/CompressionUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
package htsjdk.samtools.cram.compression;

import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.compression.rans.Constants;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class CompressionUtils {
/**
 * Writes {@code i} to {@code cp} as a variable-length sequence of 7-bit groups, most
 * significant group first. Every byte except the last has its top bit (0x80) set as a
 * continuation flag, which is the layout {@code readUint7} consumes.
 *
 * <p>{@code i} is treated as an unsigned 32-bit quantity: a negative {@code int} is written
 * as the five-byte encoding of its unsigned bit pattern rather than being truncated to its
 * low seven bits.
 *
 * @param i  the value to write, interpreted as unsigned
 * @param cp the destination buffer; between one and five bytes are appended with relative
 *           puts, advancing its position
 */
public static void writeUint7(final int i, final ByteBuffer cp) {
    // Locate the bit offset of the most significant non-empty 7-bit group. The unsigned
    // shift guarantees termination with the correct length even when the sign bit is set.
    int shift = 0;
    for (int rest = i >>> 7; rest != 0; rest >>>= 7) {
        shift += 7;
    }

    // Every group above the lowest one carries the continuation flag.
    for (; shift > 0; shift -= 7) {
        cp.put((byte) (((i >>> shift) & 0x7f) | 0x80));
    }

    // The final (lowest) group is written without the continuation flag.
    cp.put((byte) (i & 0x7f));
}

/**
 * Reads a variable-length integer made of 7-bit groups, most significant group first,
 * from the current position of {@code cp}. Bytes are consumed until one is found whose
 * top bit (0x80) is clear; that byte supplies the final group.
 *
 * <p>No length limit is enforced: groups beyond what fits in an {@code int} are simply
 * shifted out of the accumulated value.
 *
 * @param cp the source buffer; its position is advanced past the bytes consumed
 * @return the accumulated value
 */
public static int readUint7(final ByteBuffer cp) {
    int value = 0;
    while (true) {
        final byte next = cp.get();
        value = (value << 7) | (next & 0x7f);
        // A non-negative byte has its top bit clear, which marks the last group.
        if (next >= 0) {
            return value;
        }
    }
}

public static ByteBuffer encodePack(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A comment describing what this method is for would be helpful

final ByteBuffer inBuffer,
final ByteBuffer outBuffer,
final int[] frequencyTable,
final int[] packMappingTable,
final int numSymbols){
final int inSize = inBuffer.remaining();
final ByteBuffer encodedBuffer;
if (numSymbols <= 1) {
encodedBuffer = CompressionUtils.allocateByteBuffer(0);
} else if (numSymbols <= 2) {

// 1 bit per value
final int encodedBufferSize = (int) Math.ceil((double) inSize/8);
encodedBuffer = CompressionUtils.allocateByteBuffer(encodedBufferSize);
int j = -1;
for (int i = 0; i < inSize; i ++) {
if (i % 8 == 0) {
encodedBuffer.put(++j, (byte) 0);
}
encodedBuffer.put(j, (byte) (encodedBuffer.get(j) + (packMappingTable[inBuffer.get(i) & 0xFF] << (i % 8))));
}
} else if (numSymbols <= 4) {

// 2 bits per value
final int encodedBufferSize = (int) Math.ceil((double) inSize/4);
encodedBuffer = CompressionUtils.allocateByteBuffer(encodedBufferSize);
int j = -1;
for (int i = 0; i < inSize; i ++) {
if (i % 4 == 0) {
encodedBuffer.put(++j, (byte) 0);
}
encodedBuffer.put(j, (byte) (encodedBuffer.get(j) + (packMappingTable[inBuffer.get(i) & 0xFF] << ((i % 4) * 2))));
}
} else {

// 4 bits per value
final int encodedBufferSize = (int) Math.ceil((double)inSize/2);
encodedBuffer = CompressionUtils.allocateByteBuffer(encodedBufferSize);
int j = -1;
for (int i = 0; i < inSize; i ++) {
if (i % 2 == 0) {
encodedBuffer.put(++j, (byte) 0);
}
encodedBuffer.put(j, (byte) (encodedBuffer.get(j) + (packMappingTable[inBuffer.get(i) & 0xFF] << ((i % 2) * 4))));
}
}

// write numSymbols
outBuffer.put((byte) numSymbols);

// write mapping table "packMappingTable" that converts mapped value to original symbol
for(int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i ++) {
if (frequencyTable[i] > 0) {
outBuffer.put((byte) i);
}
}

// write the length of data
CompressionUtils.writeUint7(encodedBuffer.limit(), outBuffer);
return encodedBuffer; // Here position = 0 since we have always accessed the data buffer using index
}

/**
 * Expands bit-packed data back into one byte per value.
 *
 * <p>The bit width is chosen from {@code numSymbols}:
 * <ul>
 *   <li>{@code numSymbols <= 1}: no input is read; every output byte is
 *       {@code packMappingTable[0]}</li>
 *   <li>{@code numSymbols <= 2}: 1 bit per value</li>
 *   <li>{@code numSymbols <= 4}: 2 bits per value</li>
 *   <li>{@code numSymbols <= 16}: 4 bits per value</li>
 *   <li>{@code numSymbols > 16}: nothing is decoded and the freshly allocated buffer is
 *       returned unmodified</li>
 * </ul>
 * Values are taken from each input byte starting at its least significant bits.
 *
 * @param inBuffer                     the packed data; read with absolute gets starting at
 *                                     index 0, so its position is neither used nor changed
 * @param packMappingTable             maps each packed code to the output byte
 * @param numSymbols                   the number of distinct symbols, which selects the bit width
 * @param uncompressedPackOutputLength the number of output bytes to produce
 * @return a newly allocated buffer of {@code uncompressedPackOutputLength} bytes, position 0
 */
public static ByteBuffer decodePack(
        final ByteBuffer inBuffer,
        final byte[] packMappingTable,
        final int numSymbols,
        final int uncompressedPackOutputLength) {
    final ByteBuffer unpacked = CompressionUtils.allocateByteBuffer(uncompressedPackOutputLength);

    if (numSymbols <= 1) {
        // Only one symbol exists, so the output is that symbol repeated.
        for (int outIndex = 0; outIndex < uncompressedPackOutputLength; outIndex++) {
            unpacked.put(outIndex, packMappingTable[0]);
        }
        return unpacked;
    }

    if (numSymbols > 16) {
        // No packing scheme is defined here for more than 16 symbols.
        return unpacked;
    }

    final int bitsPerValue = numSymbols <= 2 ? 1 : (numSymbols <= 4 ? 2 : 4);
    final int valuesPerByte = 8 / bitsPerValue;
    final int mask = (1 << bitsPerValue) - 1;

    int nextInputIndex = 0;
    int pending = 0;
    for (int outIndex = 0; outIndex < uncompressedPackOutputLength; outIndex++) {
        // Fetch a new packed byte each time the previous one has been fully consumed.
        if (outIndex % valuesPerByte == 0) {
            pending = inBuffer.get(nextInputIndex++);
        }
        unpacked.put(outIndex, packMappingTable[pending & mask]);
        pending >>= bitsPerValue;
    }
    return unpacked;
}

/**
 * Allocates an output buffer sized for compressing {@code inSize} bytes of input.
 *
 * <p>The size calculation is identical to the one in samtools rANS_static.c. Presumably the
 * 257 * 257 * 3 term covers a frequency table that is always big enough for order 1
 * (257 * 257 entries, 3 bytes each: one byte for the symbol and two for the scaled
 * frequency), and the extra 9 bytes cover the header (an order byte plus two int lengths
 * for the compressed and uncompressed sizes).
 *
 * @param inSize the number of input bytes that will be compressed
 * @return a buffer obtained from {@code allocateByteBuffer} with room for the worst case
 * @throws CRAMException if the allocated buffer has less room than requested
 */
public static ByteBuffer allocateOutputBuffer(final int inSize) {
    final int frequencyTableBytes = 257 * 257 * 3;
    final int headerBytes = 9;
    final int requiredCapacity = inSize + frequencyTableBytes + headerBytes;

    final ByteBuffer result = allocateByteBuffer(requiredCapacity);
    if (result.remaining() >= requiredCapacity) {
        return result;
    }
    throw new CRAMException("Failed to allocate sufficient buffer size for RANS coder.");
}

// returns a new LITTLE_ENDIAN ByteBuffer of size = bufferSize
//TODO: rename this to allocateLittleEndianByteBuffer
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did you want to rename these?

/**
 * Allocates a new heap buffer and switches it to little-endian byte order.
 *
 * @param bufferSize the capacity of the new buffer, in bytes
 * @return a {@code LITTLE_ENDIAN} buffer of capacity {@code bufferSize}
 */
public static ByteBuffer allocateByteBuffer(final int bufferSize){
    final ByteBuffer allocated = ByteBuffer.allocate(bufferSize);
    allocated.order(ByteOrder.LITTLE_ENDIAN);
    return allocated;
}

/**
 * Wraps an existing byte array in a buffer and switches it to little-endian byte order.
 * The array is not copied; the returned buffer is backed by {@code inputBytes}.
 *
 * @param inputBytes the array to wrap
 * @return a {@code LITTLE_ENDIAN} buffer backed by {@code inputBytes}
 */
public static ByteBuffer wrap(final byte[] inputBytes){
    final ByteBuffer wrapped = ByteBuffer.wrap(inputBytes);
    wrapped.order(ByteOrder.LITTLE_ENDIAN);
    return wrapped;
}

/**
 * Creates a slice of {@code inputBuffer} (via {@link ByteBuffer#slice()}) and switches the
 * slice to little-endian byte order. The byte order of {@code inputBuffer} itself is not
 * changed.
 *
 * @param inputBuffer the buffer to slice
 * @return a {@code LITTLE_ENDIAN} slice of {@code inputBuffer}
 */
public static ByteBuffer slice(final ByteBuffer inputBuffer){
    final ByteBuffer sliced = inputBuffer.slice();
    sliced.order(ByteOrder.LITTLE_ENDIAN);
    return sliced;
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package htsjdk.samtools.cram.compression;

import htsjdk.samtools.cram.compression.rans.RANS;
import htsjdk.samtools.cram.compression.range.RangeDecode;
import htsjdk.samtools.cram.compression.range.RangeEncode;
import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Decode;
import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Encode;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
import htsjdk.utils.ValidationUtils;

Expand Down Expand Up @@ -71,8 +74,13 @@ public static ExternalCompressor getCompressorForMethod(

case RANS:
return compressorSpecificArg == NO_COMPRESSION_ARG ?
new RANSExternalCompressor(new RANS()) :
new RANSExternalCompressor(compressorSpecificArg, new RANS());
new RANSExternalCompressor(new RANS4x8Encode(), new RANS4x8Decode()) :
new RANSExternalCompressor(compressorSpecificArg, new RANS4x8Encode(), new RANS4x8Decode());

case RANGE:
return compressorSpecificArg == NO_COMPRESSION_ARG ?
new RangeExternalCompressor(new RangeEncode(), new RangeDecode()) :
new RangeExternalCompressor(compressorSpecificArg, new RangeEncode(), new RangeDecode());

case BZIP2:
ValidationUtils.validateArg(
Expand All @@ -85,5 +93,4 @@ public static ExternalCompressor getCompressorForMethod(
}
}

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -24,48 +24,60 @@
*/
package htsjdk.samtools.cram.compression;

import htsjdk.samtools.cram.compression.rans.RANS;
import htsjdk.samtools.cram.compression.rans.RANSParams;
import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Decode;
import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Encode;
import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Params;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;

import java.nio.ByteBuffer;
import java.util.Objects;

public final class RANSExternalCompressor extends ExternalCompressor {
private final RANS.ORDER order;
private final RANS rans;
private final RANSParams.ORDER order;
private final RANS4x8Encode ransEncode;
private final RANS4x8Decode ransDecode;

/**
* We use a shared RANS instance for all compressors.
* @param rans
*/
public RANSExternalCompressor(final RANS rans) {
this(RANS.ORDER.ZERO, rans);
public RANSExternalCompressor(
final RANS4x8Encode ransEncode,
final RANS4x8Decode ransDecode) {
this(RANSParams.ORDER.ZERO, ransEncode, ransDecode);
}

public RANSExternalCompressor(final int order, final RANS rans) {
this(RANS.ORDER.fromInt(order), rans);
public RANSExternalCompressor(
final int order,
final RANS4x8Encode ransEncode,
final RANS4x8Decode ransDecode) {
this(RANSParams.ORDER.fromInt(order), ransEncode, ransDecode);
}

public RANSExternalCompressor(final RANS.ORDER order, final RANS rans) {
public RANSExternalCompressor(
final RANSParams.ORDER order,
final RANS4x8Encode ransEncode,
final RANS4x8Decode ransDecode) {
super(BlockCompressionMethod.RANS);
this.rans = rans;
this.ransEncode = ransEncode;
this.ransDecode = ransDecode;
this.order = order;
}

@Override
public byte[] compress(final byte[] data) {
final ByteBuffer buffer = rans.compress(ByteBuffer.wrap(data), order);
final RANS4x8Params params = new RANS4x8Params(order);
final ByteBuffer buffer = ransEncode.compress(CompressionUtils.wrap(data), params);
return toByteArray(buffer);
}

@Override
public byte[] uncompress(byte[] data) {
final ByteBuffer buf = rans.uncompress(ByteBuffer.wrap(data));
final ByteBuffer buf = ransDecode.uncompress(CompressionUtils.wrap(data));
return toByteArray(buf);
}

public RANS.ORDER getOrder() { return order; }

@Override
public String toString() {
return String.format("%s(%s)", this.getMethod(), order);
Expand Down Expand Up @@ -96,4 +108,4 @@ private byte[] toByteArray(final ByteBuffer buffer) {
return bytes;
}

}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package htsjdk.samtools.cram.compression;

import htsjdk.samtools.cram.compression.range.RangeDecode;
import htsjdk.samtools.cram.compression.range.RangeEncode;
import htsjdk.samtools.cram.compression.range.RangeParams;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;

import java.nio.ByteBuffer;

/**
 * An {@link ExternalCompressor} registered under {@link BlockCompressionMethod#RANGE} that
 * delegates compression to a {@link RangeEncode} and decompression to a {@link RangeDecode}.
 */
public class RangeExternalCompressor extends ExternalCompressor{

    private final int formatFlags;
    private final RangeEncode rangeEncode;
    private final RangeDecode rangeDecode;

    /**
     * Creates a compressor that uses format flags of 0.
     *
     * @param rangeEncode the encoder used by {@link #compress(byte[])}
     * @param rangeDecode the decoder used by {@link #uncompress(byte[])}
     */
    public RangeExternalCompressor(
            final RangeEncode rangeEncode,
            final RangeDecode rangeDecode) {
        this(0, rangeEncode, rangeDecode);
    }

    /**
     * Creates a compressor with explicit format flags.
     *
     * @param formatFlags the value passed to {@link RangeParams} for every compress call
     * @param rangeEncode the encoder used by {@link #compress(byte[])}
     * @param rangeDecode the decoder used by {@link #uncompress(byte[])}
     */
    public RangeExternalCompressor(
            final int formatFlags,
            final RangeEncode rangeEncode,
            final RangeDecode rangeDecode) {
        super(BlockCompressionMethod.RANGE);
        this.formatFlags = formatFlags;
        this.rangeEncode = rangeEncode;
        this.rangeDecode = rangeDecode;
    }

    /**
     * Compresses {@code data} with the encoder, using parameters built from this
     * compressor's format flags.
     */
    @Override
    public byte[] compress(byte[] data) {
        final ByteBuffer input = CompressionUtils.wrap(data);
        final ByteBuffer compressed = rangeEncode.compress(input, new RangeParams(formatFlags));
        return toByteArray(compressed);
    }

    /** Decompresses {@code data} with the decoder. */
    @Override
    public byte[] uncompress(byte[] data) {
        final ByteBuffer input = CompressionUtils.wrap(data);
        return toByteArray(rangeDecode.uncompress(input));
    }

    /** Returns the compression method followed by the format flags in parentheses. */
    @Override
    public String toString() {
        return String.format("%s(%s)", getMethod(), formatFlags);
    }

    /**
     * Converts a buffer to a byte array. When the buffer is backed by an array that starts
     * at offset 0 and whose length equals the buffer's limit, that backing array is returned
     * directly without copying; otherwise the remaining bytes are read out into a new array,
     * which advances the buffer's position.
     */
    private byte[] toByteArray(final ByteBuffer buffer) {
        final boolean backingArrayMatchesLimit = buffer.hasArray()
                && buffer.arrayOffset() == 0
                && buffer.array().length == buffer.limit();
        if (backingArrayMatchesLimit) {
            return buffer.array();
        }

        final byte[] copy = new byte[buffer.remaining()];
        buffer.get(copy);
        return copy;
    }

}
Loading
Loading