diff --git a/CMakeLists.txt b/CMakeLists.txt index e8067bda25..03d1b09a68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,11 +11,11 @@ endif() list(INSERT CMAKE_MODULE_PATH 0 "${CMAKE_CURRENT_LIST_DIR}/cmake") set(MGARD_VERSION_MAJOR "1") -set(MGARD_VERSION_MINOR "2") +set(MGARD_VERSION_MINOR "3") set(MGARD_VERSION_PATCH "0") set(MGARD_FILE_VERSION_MAJOR "1") -set(MGARD_FILE_VERSION_MINOR "0") +set(MGARD_FILE_VERSION_MINOR "1") set(MGARD_FILE_VERSION_PATCH "0") project( @@ -201,9 +201,15 @@ set( MGARD_LIBRARY_CPP src/compress.cpp src/compress_internal.cpp - src/compressors.cpp + src/utilities.cpp + src/huffman.cpp + src/lossless_zlib.cpp + src/lossless_dispatcher.cpp src/format.cpp ) +if(zstd_FOUND) + list(APPEND MGARD_LIBRARY_CPP src/lossless_zstd.cpp) +endif() set(MAXIMUM_DIMENSION 4 CACHE STRING "Maximum supported dimension for self-describing decompression.") diff --git a/include/compress.tpp b/include/compress.tpp index ebd9a76e83..5d22b1c25a 100644 --- a/include/compress.tpp +++ b/include/compress.tpp @@ -20,16 +20,14 @@ #include "MGARDConfig.hpp" #include "TensorMultilevelCoefficientQuantizer.hpp" #include "TensorNorms.hpp" -#include "compressors.hpp" #include "decompose.hpp" #include "format.hpp" +#include "lossless.hpp" #include "quantize.hpp" #include "shuffle.hpp" namespace mgard { -using DEFAULT_INT_T = std::int64_t; - template CompressedDataset compress(const TensorMeshHierarchy &hierarchy, Real *const v, diff --git a/include/compress_internal.tpp b/include/compress_internal.tpp index fd16dcd331..6aca8124af 100644 --- a/include/compress_internal.tpp +++ b/include/compress_internal.tpp @@ -1,8 +1,8 @@ #include #include "compress.hpp" -#include "compressors.hpp" #include "decompose.hpp" +#include "lossless.hpp" #include "quantize.hpp" #include "shuffle.hpp" diff --git a/include/format.hpp b/include/format.hpp index e7821e64e6..cbc7da334f 100644 --- a/include/format.hpp +++ b/include/format.hpp @@ -66,6 +66,14 @@ serialize_header_crc32(std::uint_least64_t crc32); //!\param p Pointer whose alignment will be checked. template void check_alignment(void const *const p); +//! Check that a quantization buffer has the right alignment and a valid size. +//! +//!\param header Self-describing dataset header. +//!\param p Quantization buffer. +//!\param n Size in bytes of quantization buffer. +void check_quantization_buffer(const pb::Header &header, void const *const p, + const std::size_t n); + //! Determine whether an integral type is big endian. template bool big_endian(); @@ -74,6 +82,11 @@ template bool big_endian(); //!\return `Dataset::Type` corresponding to `Real`. template pb::Dataset::Type type_to_dataset_type(); +//! Return the `Quantization::Type` value corrresponding to an integral type. +//! +//!\return `Quantization::Type` corresponding to `Int`. +template pb::Quantization::Type type_to_quantization_type(); + //! Allocate a quantization buffer of the proper alignment and size. //! //!\param header Self-describing dataset header. @@ -165,16 +178,19 @@ pb::Header read_metadata(BufferWindow &window); //!\param header Header of the self-describing buffer. void write_metadata(std::ostream &ostream, const pb::Header &header); -//! Parse the header of a self-describing buffer. +template +//! Parse a message from a buffer window. //! //! The buffer pointer will be advanced past the header. //! -//!\param window Window into the self-describing buffer. The current position -//! should be the start of the header. -//!\param header_size Size in bytes of the header. -//!\return Header of the self-describing buffer. -pb::Header read_header(BufferWindow &window, - const std::uint_least64_t header_size); +//! This function was originally written to parse the header from a +//! self-describing buffer. +// +//!\param window Buffer window containing the serialized message. The current +//! position should be the start of the message. +//!\param nmessage Size in bytes of the message. +//!\return Parsed message. +T read_message(BufferWindow &window, const std::uint_least64_t nmessage); //! Check that a dataset was compressed with a compatible version of MGARD. //! diff --git a/include/format.tpp b/include/format.tpp index e223235a41..14b33bb77b 100644 --- a/include/format.tpp +++ b/include/format.tpp @@ -61,4 +61,26 @@ template bool big_endian() { return not*reinterpret_cast(&n); } +template +T read_message(BufferWindow &window, const std::uint_least64_t nmessage) { + // The `CodedInputStream` constructor takes an `int`. + if (nmessage > std::numeric_limits::max()) { + throw std::runtime_error("message is too large (size would overflow)"); + } + // Check that the read will stay in the buffer. + unsigned char const *const next = window.next(nmessage); + T message; + google::protobuf::io::CodedInputStream stream( + static_cast(window.current), nmessage); + if (not message.ParseFromCodedStream(&stream)) { + throw std::runtime_error( + "message parsing encountered read or format error"); + } + if (not stream.ConsumedEntireMessage()) { + throw std::runtime_error("part of message left unparsed"); + } + window.current = next; + return message; +} + } // namespace mgard diff --git a/include/huffman.hpp b/include/huffman.hpp new file mode 100644 index 0000000000..a943256d45 --- /dev/null +++ b/include/huffman.hpp @@ -0,0 +1,286 @@ +#ifndef HUFFMAN_HPP +#define HUFFMAN_HPP +//!\file +//!\brief Huffman trees for quantized multilevel coefficients. + +#include + +#include +#include +#include +#include + +#include "format.hpp" +#include "utilities.hpp" + +namespace mgard { + +//! One more than the number of symbols assigned codewords in the deprecated +//! Huffman encoding and decoding functions. +//! +//!\deprecated +inline constexpr std::size_t nql = 1 << 17; + +//! A stream compressed using a Huffman code. +//! +//!\deprecated +struct HuffmanEncodedStream { + //! Constructor. + //! + //!\param nbits Length in bits of the compressed stream. + //!\param nmissed Length in bytes of the missed array. + //!\param ntable Length in bytes of the frequency table. + HuffmanEncodedStream(const std::size_t nbits, const std::size_t nmissed, + const std::size_t ntable); + + //! Length in bits of the compressed stream. + std::size_t nbits; + + //! Compressed stream. + MemoryBuffer hit; + + //! Missed array. + MemoryBuffer missed; + + //! Frequency table. + MemoryBuffer frequencies; +}; + +//! Serialize a Huffman-encoded stream and then compress. +//! +//!\deprecated +//! +//! The header will determine which compressor is used. +//! +//!\param header Header for the self-describing buffer. +//!\param encoded Huffman-encoded stream to serialize and compress. +MemoryBuffer +serialize_compress(const pb::Header &header, + const HuffmanEncodedStream &encoded); + +//! Decompress and then deserialize a Huffman-encoded stream. +//! +//!\deprecated +//! +//! The header will determine which decompressor is used. +//! +//!\param header Header of the self-describing buffer. +//!\param src Buffer containing serialized and compressed Huffman-encoded +//! stream. +//!\param srcLen Size in bytes of the buffer. +HuffmanEncodedStream decompress_deserialize(const pb::Header &header, + unsigned char const *const src, + const std::size_t srcLen); + +//! Codeword (in progress) associated to a node in a Huffman code creation tree. +struct HuffmanCodeword { + //! Bytes containing the bits of the codeword. + std::vector bytes = {}; + + //! Length in bits of the codeword. + std::size_t length = 0; + + //! Append a bit to the codeword. + void push_back(const bool bit); + + //! Generate the codeword associated to the left child in the tree. + HuffmanCodeword left() const; + + //! Generate the codeword associated to the right child in the tree. + HuffmanCodeword right() const; +}; + +//! Node in a Huffman code creation tree. +struct CodeCreationTreeNode { + //! Constructor. + //! + //! Create a leaf node. + //! + //!\param codeword Associated codeword. + //!\param count Frequency of the associated symbol. + CodeCreationTreeNode(HuffmanCodeword *const codeword, + const std::size_t count); + + //! Constructor. + //! + //! Create an inner (parent) node. + //! + //!\param left Left child of the node to be created. + //!\param right Right child of the node to be created. + CodeCreationTreeNode(const std::shared_ptr &left, + const std::shared_ptr &right); + + //! Associated codeword (if this node is a leaf). + HuffmanCodeword *codeword = nullptr; + + //! Sum of frequencies of symbols associated to leaves descending from this + //! node. + std::size_t count; + + //! Left child of this node. + std::shared_ptr left; + + //! Right child of this node. + std::shared_ptr right; +}; + +//! Huffman code generated from/for an input stream. +//! +//!\note The construction of this class is a little convoluted. +template class HuffmanCode { +public: + static_assert(std::is_integral::value and + std::is_signed::value, + "symbol type must be signed and integral"); + + //! Shared pointer to node in Huffman code creation tree. + using Node = std::shared_ptr; + + //! Constructor. + //! + //!\param endpoints Smallest and largest symbols (inclusive) to receive + //! codewords. + //!\param begin Beginning of input stream. + //!\param end End of output stream. + HuffmanCode(const std::pair &endpoints, + Symbol const *const begin, Symbol const *const end); + + //! Constructor. + //! + //! The endpoints will be set to `default_endpoints`. + //! + //!\param begin Beginning of input stream. + //!\param end End of output stream. + HuffmanCode(Symbol const *const begin, Symbol const *const end); + + //! Constructor. + //! + //! `It::value_type` should be (convertible to) + //! `std::pair`. + //! + //!\param endpoints Smallest and largest symbols (inclusive) to receive + //! codewords. + //!\param begin Beginning of index–frequency pair range for frequency table. + //!\param end Beginning of index–frequency pair range for frequency table. + template + HuffmanCode(const std::pair &endpoints, const It begin, + const It end); + + //! Smallest and largest symbols (inclusive) eligible for codewords. + std::pair endpoints; + + //! Number of symbols eligible for codewords (including one for the 'missed' + //! symbol). + std::size_t ncodewords; + + //! Frequencies of the symbols in the input stream. + std::vector frequencies; + + //! Codewords associated to the symbols. + std::vector codewords; + + //! Report the number of symbols in the stream. + std::size_t nsymbols() const; + + //! Report the number of out-of-range symbols encountered in the stream or + //! given in the frequency table pairs. + std::size_t nmissed() const; + + //! Report the size in bits of the encoded stream. + std::size_t nbits_hit() const; + + //! Check whether a symbol is eligible for a codeword. + bool out_of_range(const Symbol symbol) const; + + //! Determine the codeword index for a symbol. + std::size_t index(const Symbol symbol) const; + +private: + //! Function object used to compare code creation tree nodes. + struct HeldCountGreater { + bool operator()(const Node &a, const Node &b) const; + }; + +public: + //! Huffman code creation tree. + std::priority_queue, HeldCountGreater> queue; + + //! Decode a codeword (identified by associated leaf) to a symbol. + //! + //!\pre `leaf` must be a leaf (rather than an interior node) of the code + //! creation tree. + //! + //!\param leaf Leaf (associated to a codeword) to decode. + //!\return A boolean indicating whether the original symbol was 'hit' and the + //! symbol itself (junk if the original symbol was 'missed'). + std::pair decode(const Node &leaf) const; + +private: + //! Default symbol range. + const static std::pair default_endpoints; + + //! Populate the frequency table using a stream of symbols. + //! + //!\pre `frequencies` should have length `ncodewords` and all entries should + //! be zero. + //! + //!\param begin Beginning of stream of symbols. + //!\param end End of stream of symbols. + void populate_frequencies(Symbol const *const begin, Symbol const *const end); + + //! Populate the frequency table from a collection of index–frequency pairs. + //! + //!\param begin Beginning of index–frequency pair range. + //!\param end End of index–frequency pair range. + template + void populate_frequencies(const It begin, const It end); + + //! Create the Huffman code creation tree. + //! + //!\note This function depends on `frequencies`. + void create_code_creation_tree(); + + // TODO: Check that frequency count ties aren't going to hurt us here. Stable + // sorting algorithm in `priority_queue`? + + //! Set codewords for given node and descendants. + void + recursively_set_codewords(const std::shared_ptr &node, + const HuffmanCodeword codeword); +}; + +//! Encode quantized coefficients using a Huffman code. +//! +//!\deprecated +//! +//!\param[in] quantized_data Input buffer (quantized coefficients). +//!\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. +HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, + const std::size_t n); + +//! Decode a stream encoded using a Huffman code. +//! +//!\deprecated +//! +//!\param[in] encoded Input buffer (Huffman-encoded stream). +MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded); + +//! Encode quantized coefficients using a Huffman code. +//! +//!\param begin Input buffer (quantized coefficients). +//!\param n Number of symbols in the input buffer. +template +MemoryBuffer huffman_encode(Symbol const *const begin, + const std::size_t n); + +//! Decode a stream encoded using a Huffman code. +//! +//!\param buffer Input buffer (Huffman-encoded stream). +template +MemoryBuffer huffman_decode(const MemoryBuffer &buffer); + +} // namespace mgard + +#include "huffman.tpp" +#endif diff --git a/include/huffman.tpp b/include/huffman.tpp new file mode 100644 index 0000000000..29239ac0fc --- /dev/null +++ b/include/huffman.tpp @@ -0,0 +1,602 @@ +#include "utilities.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include "format.hpp" + +#include "proto/mgard.pb.h" + +namespace mgard { + +// Aliases for compound message field types. +namespace { + +using Endpoints = google::protobuf::RepeatedField; +using Missed = google::protobuf::RepeatedField; +using Frequencies = + google::protobuf::Map; +using SubtableSizes = google::protobuf::RepeatedField; + +} // namespace + +template +bool HuffmanCode::HeldCountGreater:: +operator()(const typename HuffmanCode::Node &a, + const typename HuffmanCode::Node &b) const { + return a->count > b->count; +} + +template +void HuffmanCode::create_code_creation_tree() { + // We can't quite use a `ZippedRange` here, I think, because + // `ZippedRange::iterator` doesn't expose the underlying iterators and we want + // a pointer to the codeword. + typename std::vector::const_iterator p = frequencies.cbegin(); + HuffmanCodeword *q = codewords.data(); + for (std::size_t i = 0; i < ncodewords; ++i) { + const std::size_t count = *p; + if (count) { + queue.push(std::make_shared(q, count)); + } + ++p; + ++q; + } + while (queue.size() > 1) { + const std::shared_ptr a = queue.top(); + queue.pop(); + const std::shared_ptr b = queue.top(); + queue.pop(); + + queue.push(std::make_shared(a, b)); + } +} + +// This default will be used for `std::int{8,16}_t` We'll specialize the default +// for `std::int{32,64}_t` in the implementation file. +template +const std::pair HuffmanCode::default_endpoints = { + std::numeric_limits::min(), std::numeric_limits::max()}; + +// I believe these are called 'template specialization declarations.' +template <> +const std::pair + HuffmanCode::default_endpoints; + +template <> +const std::pair + HuffmanCode::default_endpoints; + +template +void HuffmanCode::populate_frequencies(Symbol const *const begin, + Symbol const *const end) { + for (const Symbol symbol : + RangeSlice{.begin_ = begin, .end_ = end}) { + ++frequencies.at(index(symbol)); + } +} + +template +std::pair HuffmanCode::decode( + const typename HuffmanCode::Node &leaf) const { + const std::ptrdiff_t offset = leaf->codeword - codewords.data(); + // If `offset == 0`, this is the leaf corresponding to out-of-range symbols. + assert(offset >= 0); + return offset ? std::pair(true, endpoints.first + (offset - 1)) + : std::pair(false, {}); +} + +template +template +void HuffmanCode::populate_frequencies(const It begin, const It end) { + for (auto [index, frequency] : RangeSlice{.begin_ = begin, .end_ = end}) { + frequencies.at(index) = frequency; + } +} + +namespace { + +template +std::size_t +ncodewords_from_endpoints(const std::pair &endpoints) { + if (endpoints.first > endpoints.second) { + throw std::invalid_argument( + "maximum symbol must be greater than or equal to minimum symbol"); + } + // One for the 'missed' symbol, and the endpoints are inclusive. + // Overflow is possible in the subtraction `endpoints.second - + // endpoints.first` (suppose `Symbol` is `char` and `endpoints` is `{CHAR_MIN, + // CHAR_MAX}`. Casting to `std::int64_t` should avoid the problem in all + // practical cases. + const std::size_t ncodewords = 1 + + static_cast(endpoints.second) - + static_cast(endpoints.first) + 1; + return ncodewords; +} + +} // namespace + +template +HuffmanCode::HuffmanCode(const std::pair &endpoints, + Symbol const *const begin, + Symbol const *const end) + : endpoints(endpoints), ncodewords(ncodewords_from_endpoints(endpoints)), + frequencies(ncodewords), codewords(ncodewords) { + populate_frequencies(begin, end); + create_code_creation_tree(); + recursively_set_codewords(queue.top(), {}); +} + +template +HuffmanCode::HuffmanCode(Symbol const *const begin, + Symbol const *const end) + : HuffmanCode(default_endpoints, begin, end) {} + +template +template +HuffmanCode::HuffmanCode(const std::pair &endpoints, + const It begin, const It end) + : endpoints(endpoints), ncodewords(ncodewords_from_endpoints(endpoints)), + frequencies(ncodewords), codewords(ncodewords) { + populate_frequencies(begin, end); + create_code_creation_tree(); + recursively_set_codewords(queue.top(), {}); +} + +template std::size_t HuffmanCode::nsymbols() const { + return std::accumulate(frequencies.begin(), frequencies.end(), + static_cast(0)); +} + +template std::size_t HuffmanCode::nmissed() const { + return frequencies.at(0); +} + +template std::size_t HuffmanCode::nbits_hit() const { + std::size_t nbits = 0; + for (std::size_t i = 0; i < ncodewords; ++i) { + nbits += frequencies.at(i) * codewords.at(i).length; + } + return nbits; +} + +template +bool HuffmanCode::out_of_range(const Symbol symbol) const { + return symbol < endpoints.first or symbol > endpoints.second; +} + +template +std::size_t HuffmanCode::index(const Symbol symbol) const { + return out_of_range(symbol) ? 0 : 1 + symbol - endpoints.first; +} + +template +void HuffmanCode::recursively_set_codewords( + const std::shared_ptr &node, + const HuffmanCodeword codeword) { + const bool children = node->left; + assert(children == static_cast(node->right)); + if (children) { + recursively_set_codewords(node->left, codeword.left()); + recursively_set_codewords(node->right, codeword.right()); + } else { + *node->codeword = codeword; + } +} + +namespace { + +//! Maximum number of elements per frequency/missed subtable. +inline constexpr std::size_t SUBTABLE_MAX_SIZE = 1 << 20; + +//! A logical table split into one or more subtables of moderate size. +//! +//! The logical table can be read by chaining the subtables. +template struct Supertable { + // The beginning and size of a subtable. + using Segment = std::pair; + + //! Constructor. + //! + //! Construct an 'empty' `Supertable`. The data members will be given the + //! right sizes, but for the most part they will not populated. That is left + //! to derived class constructors or callers. + //! + //!\param nelements Total number of subtable entries. + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). This field will be written to. + Supertable(const std::size_t nelements, SubtableSizes &nbytes_subtables) + : nsubtables((nelements + SUBTABLE_MAX_SIZE - 1) / SUBTABLE_MAX_SIZE), + subtables(nsubtables), segments(nsubtables), + nbytes_subtables(nbytes_subtables) { + nbytes_subtables.Resize(nsubtables, 0); + + for (std::size_t i = 0; i + 1 < nsubtables; ++i) { + segments.at(i).second = SUBTABLE_MAX_SIZE; + } + if (nsubtables) { + // If `nelements` is an exact multiple of `SUBTABLE_MAX_SIZE` and not + // zero, we need this last size to be `SUBTABLE_MAX_SIZE`, not `0`. If + // `nelements` is zero, we won't be executing this statement. + segments.back().second = nelements % SUBTABLE_MAX_SIZE + ? nelements % SUBTABLE_MAX_SIZE + : SUBTABLE_MAX_SIZE; + } + } + + //! Constructor. + //! + //! Construct a `Supertable` from a collection of parsed messages. This + //! constructor leaves `segments` uninitialized. This is because `Supertable` + //! doesn't know which field of `Message` is the subtable. + //! + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). + //!\param window Window into buffer containing messages to be parsed. + Supertable(SubtableSizes &nbytes_subtables, BufferWindow &window) + : nsubtables(nbytes_subtables.size()), subtables(nsubtables), + segments(nsubtables), nbytes_subtables(nbytes_subtables) { + for (std::size_t i = 0; i < nsubtables; ++i) { + subtables.at(i) = read_message(window, nbytes_subtables.Get(i)); + } + } + + //! Calculate and store the sizes in bytes of the subtables. + //! + //! This function should be called once the subtables are populated. + //! `nbytes_subtables` (a field in some `pb::HuffmanHeader`) will be modified. + //! Subsequent changes to the subtables will invalidate the sizes. + void calculate_nbytes_subtables() { + for (std::size_t i = 0; i < nsubtables; ++i) { + nbytes_subtables.Set(i, subtables.at(i).ByteSize()); + } + } + + //! Calculate the total size in bytes of the subtables. + //! + //! This function assumes no changes have been made to the subtables since the + //! last call to `calculate_nbytes_subtables`. + std::size_t ByteSize() const { + return std::accumulate(nbytes_subtables.begin(), nbytes_subtables.end(), + static_cast(0)); + } + + //! Write the subtables out to a buffer. + //! + //!\param p Buffer to which to serialize the subtables. + //!\param n Expected number of bytes that will be written. + void SerializeToArray(void *const p, const std::size_t n) const { + unsigned char *const p_ = reinterpret_cast(p); + std::size_t total = 0; + for (std::size_t i = 0; i < nsubtables; ++i) { + const Message &subtable = subtables.at(i); + const google::protobuf::uint64 nbytes_subtable = nbytes_subtables.Get(i); + + subtable.SerializeToArray(p_ + total, nbytes_subtable); + total += nbytes_subtable; + } + if (total != n) { + throw std::invalid_argument("serialization buffer size incorrect"); + } + } + + //! Number of subtables. + std::size_t nsubtables; + + //! Subtables. + //! + //! It might be better to name this member 'messages.' Elsewhere we use + //! 'subtable' to refer to the fields of the messages containing the + //! supertable elements. Using that vocabulary, a `pb::FrequencySubtable` + //! would be a message while its `frequencies` field would be the subtable. + std::vector subtables; + + //! Segments for a concatenated subtable chain. + //! + //! A `Chain::iterator>` can be constructed from this. + std::vector segments; + + //! Sizes in bytes of the subtables. + SubtableSizes &nbytes_subtables; +}; + +//! A logical frequency table split into one or more subtables of moderate size. +struct FrequencySupertable + : Supertable { + //! Constructor. + //! + //! Construct and populate a `FrequencySupertable` from a vector of symbol + //! frequencies. + //! + //!\param frequencies Symbol frequencies to store in the subtables. + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). This field will be written to. + FrequencySupertable(const std::vector &frequencies, + SubtableSizes &nbytes_subtables) + : Supertable(std::count_if(frequencies.begin(), frequencies.end(), + [](const std::size_t frequency) -> bool { + return frequency; + }), + nbytes_subtables) { + // `i` is the index of the subtable we're inserting into. (Technically + // we're inserting into the subtable's frequency map field rather than + // the subtable itself.) `j` is the number of entries we've inserted + // into subtable `i`. `k` is the index in the vector of frequencies + // passed to the constructor. + std::size_t k = 0; + for (std::size_t i = 0; i < nsubtables; ++i) { + Frequencies &frequencies_ = *subtables.at(i).mutable_frequencies(); + Segment &segment = segments.at(i); + // How big `frequencies_` should be when we're done. + const std::size_t nfrequencies_ = segment.second; + for (std::size_t j = 0; j < nfrequencies_; ++k) { + const std::size_t frequency = frequencies.at(k); + if (frequency) { + frequencies_.insert({k, frequency}); + ++j; + } + } + segment.first = frequencies_.begin(); + } + + calculate_nbytes_subtables(); + } + + //! Constructor. + //! + //! Construct a `FrequencySubtable` from a collection of parsed messages. + //! + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). + //!\param window Window into buffer containing messages to be parsed. + FrequencySupertable(SubtableSizes &nbytes_subtables, BufferWindow &window) + : Supertable(nbytes_subtables, window) { + for (std::size_t i = 0; i < nsubtables; ++i) { + Segment &segment = segments.at(i); + Frequencies &frequencies = *subtables.at(i).mutable_frequencies(); + + segment.first = frequencies.begin(); + segment.second = frequencies.size(); + } + } +}; + +//! A logical 'missed' table split into one or more subtables of moderate size. +struct MissedSupertable : Supertable { + //! Constructor. + //! + //! Construct an 'empty' `MissedSupertable`. It is expected that the caller + //! will subsequently write to the subtables using `Chain`. + //! + //!\param nmissed Number of missed symbols. + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). This field will be written to. + MissedSupertable(const std::size_t nmissed, SubtableSizes &nbytes_subtables) + : Supertable(nmissed, nbytes_subtables) { + for (std::size_t i = 0; i < nsubtables; ++i) { + Missed &missed = *subtables.at(i).mutable_missed(); + Segment &segment = segments.at(i); + // How big `missed` should be when we're done. + const std::size_t nmissed = segment.second; + + missed.Resize(nmissed, 0); + segment.first = missed.begin(); + } + } + + //! Constructor. + //! + //! Construct a `MissedSubtable` from a collection of parsed messages. + //! + //!\param nbytes_subtables Sizes in bytes of the subtables (field in + //! `pb::HuffmanHeader`). + //!\param window Window into buffer containing messages to be parsed. + MissedSupertable(SubtableSizes &nbytes_subtables, BufferWindow &window) + : Supertable(nbytes_subtables, window) { + for (std::size_t i = 0; i < nsubtables; ++i) { + Segment &segment = segments.at(i); + Missed &missed = *subtables.at(i).mutable_missed(); + + segment.first = missed.begin(); + segment.second = missed.size(); + } + } +}; + +} // namespace + +template +MemoryBuffer huffman_encode(Symbol const *const begin, + const std::size_t n) { + const HuffmanCode code(begin, begin + n); + + const std::size_t nbits = code.nbits_hit(); + const std::size_t nbytes_hit = (nbits + CHAR_BIT - 1) / CHAR_BIT; + + pb::HuffmanHeader header; + header.set_index_mapping(pb::HuffmanHeader::INCLUSIVE_RANGE); + header.set_codeword_mapping(pb::HuffmanHeader::INDEX_FREQUENCY_PAIRS); + header.set_missed_encoding(pb::HuffmanHeader::LITERAL); + header.set_hit_encoding(pb::HuffmanHeader::RUN_TOGETHER); + + header.add_endpoints(code.endpoints.first); + header.add_endpoints(code.endpoints.second); + header.set_nbits(nbits); + + // Originally, `pb::HuffmanHeader` had a field each for the frequency and + // 'missed' tables. Unfortunately, these tables can get very big. In + // particular, if the error tolerance is very low, the quantized coefficients + // will be very large, and many of them will be missed. This could result in + // the size of the 'missed' table exceeding the (default) limit imposed by + // `google::protobuf::CodedInputStream`. See . As a workaround, we are splitting the + // 'missed' table (and, for good measure, the frequency table, too) into a + // sequence of subtables of moderate size. + + // This `FrequencySupertable` creates and populates the frequency subtables. + FrequencySupertable frequency_supertable( + code.frequencies, *header.mutable_nbytes_frequency_subtables()); + // This `MissedSupertable` creates but does not populate the 'missed' + // subtables. We'll populate the subtables below, as we encode the stream. + MissedSupertable missed_supertable(code.nmissed(), + *header.mutable_nbytes_missed_subtables()); + + // This `Chain` lets us treat the 'missed' subtables as a single logical + // table. It frees us from manually keeping track of when we need to advance + // from one subtable to the next. + Chain chained_missed_supertable(missed_supertable.segments); + Chain::iterator missed = chained_missed_supertable.begin(); + // Now we're ready to populate the 'missed' subtables in the course of + // populating the 'hit' buffer. + + // Zero-initialize the bytes. + unsigned char *const hit_ = new unsigned char[nbytes_hit](); + unsigned char *hit = hit_; + + unsigned char offset = 0; + for (const Symbol q : PseudoArray(begin, n)) { + if (code.out_of_range(q)) { + *missed++ = q; + } + + const HuffmanCodeword codeword = code.codewords.at(code.index(q)); + std::size_t NREMAINING = codeword.length; + for (unsigned char byte : codeword.bytes) { + // Number of bits of `byte` left to write. + unsigned char nremaining = + std::min(static_cast(CHAR_BIT), NREMAINING); + // Premature, but this will hold when we're done with `byte`. + NREMAINING -= nremaining; + + while (nremaining) { + *hit |= byte >> offset; + // Number of bits of `byte` just written (not cumulative). + const unsigned char nwritten = std::min( + nremaining, static_cast( + static_cast(CHAR_BIT) - offset)); + offset += nwritten; + hit += offset / CHAR_BIT; + offset %= CHAR_BIT; + nremaining -= nwritten; + byte <<= nwritten; + } + } + } + + // We're done writing to the 'missed' subtables, so we can now calculate their + // serialized sizes. We need to do this before calling + // `missed_supertable.ByteSize`. + missed_supertable.calculate_nbytes_subtables(); + + const std::uint_least64_t nheader = header.ByteSize(); + const std::size_t nbytes_frequency_supertable = + frequency_supertable.ByteSize(); + const std::size_t nbytes_missed_supertable = missed_supertable.ByteSize(); + MemoryBuffer out(HEADER_SIZE_SIZE + nheader + + nbytes_frequency_supertable + + nbytes_missed_supertable + nbytes_hit); + { + unsigned char *p = out.data.get(); + const std::array nheader_ = + serialize_header_size(nheader); + std::copy(nheader_.begin(), nheader_.end(), p); + p += HEADER_SIZE_SIZE; + + header.SerializeToArray(p, nheader); + p += nheader; + + frequency_supertable.SerializeToArray(p, nbytes_frequency_supertable); + p += nbytes_frequency_supertable; + + missed_supertable.SerializeToArray(p, nbytes_missed_supertable); + p += nbytes_missed_supertable; + + std::copy(hit_, hit_ + nbytes_hit, p); + p += nbytes_hit; + } + + delete[] hit_; + + return out; +} + +template +MemoryBuffer huffman_decode(const MemoryBuffer &buffer) { + BufferWindow window(buffer.data.get(), buffer.size); + const std::uint_least64_t nheader = read_header_size(window); + pb::HuffmanHeader header = read_message(window, nheader); + + if (header.index_mapping() != pb::HuffmanHeader::INCLUSIVE_RANGE) { + throw std::runtime_error("unrecognized Huffman index mapping"); + } + const Endpoints &endpoints_ = header.endpoints(); + if (endpoints_.size() != 2) { + throw std::runtime_error("received an unexpected number of endpoints"); + } + const std::pair endpoints(endpoints_.Get(0), + endpoints_.Get(1)); + + if (header.codeword_mapping() != pb::HuffmanHeader::INDEX_FREQUENCY_PAIRS) { + throw std::runtime_error("unrecognized Huffman codeword mapping"); + } + // See the comments in `huffman_encode` for an explanation of why we use these + // `Supertable`s and `Chain`s. + FrequencySupertable frequency_supertable( + *header.mutable_nbytes_frequency_subtables(), window); + Chain chained_frequency_supertable( + frequency_supertable.segments); + + if (header.missed_encoding() != pb::HuffmanHeader::LITERAL) { + throw std::runtime_error("unrecognized Huffman missed buffer encoding"); + } + MissedSupertable missed_supertable(*header.mutable_nbytes_missed_subtables(), + window); + Chain chained_missed_supertable(missed_supertable.segments); + Chain::iterator missed = chained_missed_supertable.begin(); + + if (header.hit_encoding() != pb::HuffmanHeader::RUN_TOGETHER) { + throw std::runtime_error("unrecognized Huffman hit buffer encoding"); + } + + const std::size_t nbits = header.nbits(); + const std::size_t nbytes = (nbits + CHAR_BIT - 1) / CHAR_BIT; + if (window.current + nbytes != window.end) { + throw std::runtime_error("number of bits in hit buffer inconsistent with " + "number of bytes in hit buffer"); + } + + const HuffmanCode code(endpoints, + chained_frequency_supertable.begin(), + chained_frequency_supertable.end()); + const std::size_t nsymbols = code.nsymbols(); + MemoryBuffer out(nsymbols); + Symbol *q = out.data.get(); + + const Bits bits(window.current, window.current + nbits / CHAR_BIT, + nbits % CHAR_BIT); + std::size_t nbits_read = 0; + const typename HuffmanCode::Node root = code.queue.top(); + assert(root); + Bits::iterator b = bits.begin(); + for (std::size_t i = 0; i < nsymbols; ++i) { + typename HuffmanCode::Node node; + for (node = root; node->left; + node = *b++ ? node->right : node->left, ++nbits_read) + ; + const std::pair decoded = code.decode(node); + *q++ = decoded.first ? decoded.second : *missed++; + } + assert(nbits_read == nbits); + assert(missed == chained_missed_supertable.end()); + + return out; +} + +} // namespace mgard diff --git a/include/compressors.hpp b/include/lossless.hpp similarity index 51% rename from include/compressors.hpp rename to include/lossless.hpp index 17cd7f7ce3..b3cafe1175 100644 --- a/include/compressors.hpp +++ b/include/lossless.hpp @@ -1,71 +1,49 @@ -#ifndef COMPRESSORS_HPP -#define COMPRESSORS_HPP +#ifndef LOSSLESS_HPP +#define LOSSLESS_HPP //!\file //!\brief Lossless compressors for quantized multilevel coefficients. #include -// For `z_const`. -#include - -#include - #include "proto/mgard.pb.h" #include "utilities.hpp" namespace mgard { -//! Compress an array using a Huffman tree. -//! -//!\param[in] src Array to be compressed. -//!\param[in] srcLen Size of array (number of elements) to be compressed. -MemoryBuffer compress_memory_huffman(long int *const src, - const std::size_t srcLen); - -//! Decompress an array compressed with `compress_memory_huffman`. -//! -//!\param[in] src Compressed array. -//!\param[in] srcLen Size in bytes of the compressed array. -//!\param[out] dst Decompressed array. -//!\param[in] dstLen Size in bytes of the decompressed array. -void decompress_memory_huffman(unsigned char *const src, - const std::size_t srcLen, long int *const dst, - const std::size_t dstLen); - #ifdef MGARD_ZSTD //! Compress an array using `zstd`. //! //!\param[in] src Array to be compressed. //!\param[in] srcLen Size in bytes of the array to be compressed. -MemoryBuffer compress_memory_zstd(void const *const src, - const std::size_t srcLen); +MemoryBuffer compress_zstd(void const *const src, + const std::size_t srcLen); -//! Decompress an array compressed with `compress_memory_zstd`. +//! Decompress an array compressed with `compress_zstd`. //! //!\param[in] src Compressed array. //!\param[in] srcLen Size in bytes of the compressed array. //!\param[out] dst Decompressed array. //!\param[in] dstLen Size in bytes of the decompressed array. -void decompress_memory_zstd(void const *const src, const std::size_t srcLen, - unsigned char *const dst, const std::size_t dstLen); +void decompress_zstd(void const *const src, const std::size_t srcLen, + unsigned char *const dst, const std::size_t dstLen); #endif //! Compress an array using `zlib`. //! //!\param src Array to be compressed. //!\param srcLen Size in bytes of the array to be compressed. -MemoryBuffer compress_memory_z(void z_const *const src, - const std::size_t srcLen); +MemoryBuffer compress_zlib(void const *const src, + const std::size_t srcLen); -//! Decompress an array with `compress_memory_z`. +//! Decompress an array with `compress_zlib`. //! //!\param src Compressed array. //!\param srcLen Size in bytes of the compressed array data //!\param dst Decompressed array. //!\param dstLen Size in bytes of the decompressed array. -void decompress_memory_z(void z_const *const src, const std::size_t srcLen, - unsigned char *const dst, const std::size_t dstLen); +void decompress_zlib(void const *const src, const std::size_t srcLen, + unsigned char *const dst, const std::size_t dstLen); //! Compress an array of quantized multilevel coefficients. //! @@ -74,7 +52,8 @@ void decompress_memory_z(void z_const *const src, const std::size_t srcLen, //!\param[in] header Header for the self-describing buffer. //!\param[in] src Array of quantized multilevel coefficients. //!\param[in] srcLen Size in bytes of the input array. -MemoryBuffer compress(const pb::Header &header, void *const src, +MemoryBuffer compress(const pb::Header &header, + void const *const src, const std::size_t srcLen); //! Decompress an array of quantized multilevel coefficients. @@ -86,7 +65,7 @@ MemoryBuffer compress(const pb::Header &header, void *const src, //!\param[in] srcLen Size in bytes of the compressed array. //!\param[out] dst Decompressed array. //!\param[in] dstLen Size in bytes of the decompressed array. -void decompress(const pb::Header &header, void *const src, +void decompress(const pb::Header &header, void const *const src, const std::size_t srcLen, void *const dst, const std::size_t dstLen); diff --git a/include/mgard-x/CompressionHighLevel/Metadata.hpp b/include/mgard-x/CompressionHighLevel/Metadata.hpp index a0111629ce..3ba5bb1dd4 100644 --- a/include/mgard-x/CompressionHighLevel/Metadata.hpp +++ b/include/mgard-x/CompressionHighLevel/Metadata.hpp @@ -86,11 +86,8 @@ template struct Metadata { std::cout << "Metadata size: " << metadata_size << "\n"; std::cout << "Metadata crc32: " << metadata_crc32 << "\n"; std::cout << "Endiness: "; - if (etype == endiness_type::Big_Endian) { - std::cout << "Big Endian\n"; - } else { - std::cout << "Little Endian\n"; - } + std::cout << (etype == endiness_type::Big_Endian ? "Big Endian\n" + : "Little Endian\n"); std::cout << "Data type: "; if (dtype == data_type::Float) { std::cout << "Float\n"; @@ -173,11 +170,8 @@ template struct Metadata { private: SERIALIZED_TYPE *SerializeAll(uint32_t &total_size) { - if (big_endian()) { - etype = endiness_type::Big_Endian; - } else { - etype = endiness_type::Little_Endian; - } + etype = big_endian() ? endiness_type::Big_Endian + : endiness_type::Little_Endian; total_size = 0; // about MGARD software @@ -469,13 +463,10 @@ template struct Metadata { mgard::pb::Quantization &quantization = *header.mutable_quantization(); quantization.set_method(mgard::pb::Quantization::COEFFICIENTWISE_LINEAR); quantization.set_bin_widths(mgard::pb::Quantization::PER_COEFFICIENT); - quantization.set_type(mgard::pb::Quantization::INT64_T); - quantization.set_big_endian(big_endian()); - if (big_endian()) { - etype = endiness_type::Big_Endian; - } else { - etype = endiness_type::Little_Endian; - } + quantization.set_type(mgard::type_to_quantization_type()); + quantization.set_big_endian(big_endian()); + etype = big_endian() ? endiness_type::Big_Endian + : endiness_type::Little_Endian; } { // Encoding @@ -710,13 +701,11 @@ template struct Metadata { mgard::pb::Quantization::COEFFICIENTWISE_LINEAR); assert(quantization.bin_widths() == mgard::pb::Quantization::PER_COEFFICIENT); - assert(quantization.type() == mgard::pb::Quantization::INT64_T); - assert(quantization.big_endian() == big_endian()); - if (big_endian()) { - etype = endiness_type::Big_Endian; - } else { - etype = endiness_type::Little_Endian; - } + assert(quantization.type() == + mgard::type_to_quantization_type()); + assert(quantization.big_endian() == big_endian()); + etype = big_endian() ? endiness_type::Big_Endian + : endiness_type::Little_Endian; } { // Encoding diff --git a/include/mgard-x/Lossless/CPU.hpp b/include/mgard-x/Lossless/CPU.hpp index 9171f87678..2583e155db 100644 --- a/include/mgard-x/Lossless/CPU.hpp +++ b/include/mgard-x/Lossless/CPU.hpp @@ -1,181 +1,33 @@ #ifndef MGARD_X_CPU_LOSSLESS_TEMPLATE_HPP #define MGARD_X_CPU_LOSSLESS_TEMPLATE_HPP -#include - -/*! CHECK - * Check that the condition holds. If it doesn't print a message and die. - */ -#define CHECK(cond, ...) \ - do { \ - if (!(cond)) { \ - fprintf(stderr, "%s:%d CHECK(%s) failed: ", __FILE__, __LINE__, #cond); \ - fprintf(stderr, "" __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - exit(1); \ - } \ - } while (0) - -/*! CHECK_ZSTD - * Check the zstd error code and die if an error occurred after printing a - * message. - */ -/*! CHECK_ZSTD - * Check the zstd error code and die if an error occurred after printing a - * message. - */ -#define CHECK_ZSTD(fn, ...) \ - do { \ - size_t const err = (fn); \ - CHECK(!ZSTD_isError(err), "%s", ZSTD_getErrorName(err)); \ - } while (0) - -namespace mgard { -void huffman_encoding(long int *quantized_data, const std::size_t n, - unsigned char **out_data_hit, size_t *out_data_hit_size, - unsigned char **out_data_miss, size_t *out_data_miss_size, - unsigned char **out_tree, size_t *out_tree_size); -void huffman_decoding(long int *quantized_data, - const std::size_t quantized_data_size, - unsigned char *out_data_hit, size_t out_data_hit_size, - unsigned char *out_data_miss, size_t out_data_miss_size, - unsigned char *out_tree, size_t out_tree_size); -} // namespace mgard +#include "proto/mgard.pb.h" +#include namespace mgard_x { -template -unsigned char *compress_memory_huffman(long int *const src, - const std::size_t srcLen, - std::size_t &outsize) { - unsigned char *out_data_hit = 0; - size_t out_data_hit_size; - unsigned char *out_data_miss = 0; - size_t out_data_miss_size; - unsigned char *out_tree = 0; - size_t out_tree_size; - ::mgard::huffman_encoding(src, srcLen, &out_data_hit, &out_data_hit_size, - &out_data_miss, &out_data_miss_size, &out_tree, - &out_tree_size); - - const size_t total_size = - out_data_hit_size / 8 + 4 + out_data_miss_size + out_tree_size; - unsigned char *payload = (unsigned char *)malloc(total_size); - unsigned char *bufp = payload; - - if (out_tree_size) { - std::memcpy(bufp, out_tree, out_tree_size); - bufp += out_tree_size; +template mgard::pb::Header setup_header() { + mgard::pb::Header header; + mgard::pb::Quantization &q = *header.mutable_quantization(); + if (std::is_same::value) { + q.set_type(mgard::pb::Quantization::INT8_T); + } else if (std::is_same::value) { + q.set_type(mgard::pb::Quantization::INT16_T); + } else if (std::is_same::value) { + q.set_type(mgard::pb::Quantization::INT32_T); + } else if (std::is_same::value) { + q.set_type(mgard::pb::Quantization::INT64_T); } - - std::memcpy(bufp, out_data_hit, out_data_hit_size / 8 + 4); - bufp += out_data_hit_size / 8 + 4; - - if (out_data_miss_size) { - std::memcpy(bufp, out_data_miss, out_data_miss_size); - bufp += out_data_miss_size; - } - - free(out_tree); - free(out_data_hit); - free(out_data_miss); - - const size_t cBuffSize = ZSTD_compressBound(total_size); - unsigned char *const zstd_buffer = new unsigned char[cBuffSize]; - - const std::size_t cSize = - ZSTD_compress(zstd_buffer, cBuffSize, payload, total_size, 1); - CHECK_ZSTD(cSize); - - free(payload); - payload = 0; - - const std::size_t bufferLen = 3 * sizeof(size_t) + cSize; - unsigned char *const buffer = new unsigned char[bufferLen]; - outsize = bufferLen; - - bufp = buffer; - *(size_t *)bufp = out_tree_size; - bufp += sizeof(size_t); - - *(size_t *)bufp = out_data_hit_size; - bufp += sizeof(size_t); - - *(size_t *)bufp = out_data_miss_size; - bufp += sizeof(size_t); - - { - unsigned char const *const p = zstd_buffer; - std::copy(p, p + cSize, bufp); - } - - { - unsigned char *buf = buffer; - out_tree_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_hit_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_miss_size = *(size_t *)buf; - buf += sizeof(size_t); - } - - return buffer; -} - -template -void decompress_memory_huffman(unsigned char *const src, - const std::size_t srcLen, long int *const dst, - const std::size_t dstLen) { - unsigned char *out_data_hit = 0; - size_t out_data_hit_size; - unsigned char *out_data_miss = 0; - size_t out_data_miss_size; - unsigned char *out_tree = 0; - size_t out_tree_size; - - unsigned char *buf = src; - - out_tree_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_hit_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_miss_size = *(size_t *)buf; - buf += sizeof(size_t); - - size_t total_huffman_size = - out_tree_size + out_data_hit_size / 8 + 4 + out_data_miss_size; - unsigned char *huffman_encoding_p = - (unsigned char *)malloc(total_huffman_size); - - size_t const dSize = ZSTD_decompress(huffman_encoding_p, total_huffman_size, - buf, srcLen - 3 * sizeof(size_t)); - CHECK_ZSTD(dSize); - - /* When zstd knows the content size, it will error if it doesn't match. */ - CHECK(total_huffman_size == dSize, - "Impossible because zstd will check this condition!"); - - out_tree = huffman_encoding_p; - out_data_hit = huffman_encoding_p + out_tree_size; - out_data_miss = - huffman_encoding_p + out_tree_size + out_data_hit_size / 8 + 4; - - mgard::huffman_decoding(dst, dstLen, out_data_hit, out_data_hit_size, - out_data_miss, out_data_miss_size, out_tree, - out_tree_size); - - free(huffman_encoding_p); + mgard::pb::Encoding &e = *header.mutable_encoding(); + // MGARD-X requires Zstd, so we always use CPU_HUFFMAN_ZSTD + e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); + e.set_serialization(mgard::pb::Encoding::RFMH); + return header; } template Array<1, Byte, DeviceType> CPUCompress(SubArray<1, C, DeviceType> &input_data) { - // PrintSubarray("CPUCompress input", input_data); - size_t input_count = input_data.getShape(0); C *in_data = NULL; @@ -183,14 +35,10 @@ Array<1, Byte, DeviceType> CPUCompress(SubArray<1, C, DeviceType> &input_data) { MemoryManager::Copy1D(in_data, input_data.data(), input_count, 0); DeviceRuntime::SyncQueue(0); - std::vector qv(input_count); - for (size_t i = 0; i < input_count; i++) { - qv[i] = (long int)in_data[i]; - } - - std::size_t actual_out_size; - unsigned char *lossless_data = compress_memory_huffman( - qv.data(), qv.size(), actual_out_size); + mgard::MemoryBuffer lossless_data_buffer = + mgard::compress(setup_header(), in_data, input_count * sizeof(C)); + unsigned char *lossless_data = lossless_data_buffer.data.get(); + std::size_t actual_out_size = lossless_data_buffer.size; uint8_t *out_data = NULL; MemoryManager::MallocHost(out_data, @@ -205,10 +53,6 @@ Array<1, Byte, DeviceType> CPUCompress(SubArray<1, C, DeviceType> &input_data) { MemoryManager::FreeHost(out_data); MemoryManager::FreeHost(in_data); - delete[] lossless_data; - - // PrintSubarray("CPUCompress output", SubArray(output_data)); - return output_data; } @@ -216,7 +60,6 @@ template Array<1, C, DeviceType> CPUDecompress(SubArray<1, Byte, DeviceType> &input_data) { - // PrintSubarray("CPUDecompress input", input_data); size_t input_count = input_data.getShape(0); Byte *in_data = NULL; MemoryManager::MallocHost(in_data, input_count, 0); @@ -225,19 +68,13 @@ CPUDecompress(SubArray<1, Byte, DeviceType> &input_data) { uint32_t actual_out_count = 0; actual_out_count = *reinterpret_cast(in_data); - // *oriData = (uint8_t*)malloc(outSize); C *out_data = NULL; MemoryManager::MallocHost(out_data, actual_out_count, 0); DeviceRuntime::SyncQueue(0); - long int *qv = new long int[actual_out_count]; - size_t out_size = actual_out_count * sizeof(long int); - decompress_memory_huffman( - in_data + sizeof(size_t), input_count - sizeof(size_t), qv, out_size); - - for (size_t i = 0; i < actual_out_count; i++) { - out_data[i] = (C)qv[i]; - } + mgard::decompress(setup_header(), in_data + sizeof(size_t), + input_count - sizeof(size_t), out_data, + actual_out_count * sizeof(C)); Array<1, C, DeviceType> output_data({(SIZE)actual_out_count}); output_data.load(out_data); @@ -245,8 +82,6 @@ CPUDecompress(SubArray<1, Byte, DeviceType> &input_data) { MemoryManager::FreeHost(out_data); MemoryManager::FreeHost(in_data); - // PrintSubarray("CPUDecompress output", SubArray(output_data)); - return output_data; } diff --git a/include/utilities.hpp b/include/utilities.hpp index 626bc6f235..f687277eaa 100644 --- a/include/utilities.hpp +++ b/include/utilities.hpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace mgard { @@ -292,6 +293,8 @@ bool operator==(const RangeSlice &a, const RangeSlice &b); template bool operator!=(const RangeSlice &a, const RangeSlice &b); +#ifndef __NVCC__ + //! Mimic Python's `itertools.product`. Allow iteration over the Cartesian //! product of a collection of ranges. //! @@ -410,6 +413,8 @@ template class CartesianProduct::iterator { std::array inner; }; +#endif + //! Check that a dimension index is in bounds. //! //!\param dimension Dimension index. @@ -449,6 +454,188 @@ template struct MemoryBuffer { std::size_t size; }; +//! Range allowing iteration over the bits in an array. +//! +//! Iterating over this object yields each byte's bits from most to least +//! significant. +class Bits { +public: + //! Constructor. + //! + //!\param begin Pointer to the beginning of the array to be iterated over. + //!\param end Pointer to the end of the array to be iterated over. + Bits(unsigned char const *const begin, unsigned char const *const end); + + //! Constructor. + //! + //!\overload + //! + //!\param begin Pointer to the beginning of the array to be iterated over. + //!\param end Pointer to the end of the array to be iterated over. + //!\param offset_end Offset for end iterator. + Bits(unsigned char const *const begin, unsigned char const *const end, + const unsigned char offset_end); + + //! Equality comparison. + bool operator==(const Bits &other) const; + + //! Inequality comparison. + bool operator!=(const Bits &other) const; + + // Forward declaration. + class iterator; + + //! Return an iterator to the beginning of the bit range. + iterator begin() const; + + //! Return an iterator to the end of the bit range. + iterator end() const; + +private: + //! Pointer to the beginning of the array to be iterated over. + unsigned char const *begin_; + + //! Pointer to the beginning of the array to be iterated over. + unsigned char const *end_; + + //! Offset for end iterator. + unsigned char offset_end; +}; + +//! Iterator over a bit range. +class Bits::iterator { +public: + //! Category of the iterator. + using iterator_category = std::forward_iterator_tag; + //! Type iterated over. + using value_type = bool; + //! Type for distance between iterators. + using difference_type = std::ptrdiff_t; + //! Pointer to `value_type`. + using pointer = value_type *; + //! Type returned by the dereference operator. + using reference = value_type; + + //! Constructor. + //! + //!\param bits Associated bit range. + //!\param p Position in the array being iterated over. + //!\param offset Offset within the current byte. + iterator(const Bits &bits, unsigned char const *const p, + const unsigned char offset); + + //! Equality comparison. + bool operator==(const iterator &other) const; + + //! Inequality comparison. + bool operator!=(const iterator &other) const; + + //! Preincrement. + iterator &operator++(); + + //! Postincrement. + iterator operator++(int); + + //! Dereference. + reference operator*() const; + +private: + //! Associated bit range. + const Bits &iterable; + + //! Position in the array being iterated over. + unsigned char const *p; + + //! Offset within the current byte. + unsigned char offset; +}; + +//! Concatenated iterator ranges. +//! +//! Approximate Python's `itertools.chain` generator. +template class Chain { +public: + //! Constructor. + //! + //!\param segments Beginnings and lengths of iterator ranges. + Chain(const std::vector> &segments); + + // Forward declaration. + class iterator; + + //! Return an iterator to the beginning of the enumeration. + iterator begin() const; + + //! Return an iterator to the end of the enumeration. + iterator end() const; + + //! Beginnings and lengths of iterator ranges. + std::vector> segments; +}; + +//! Equality comparison. +template bool operator==(const Chain &a, const Chain &b); + +//! Inequality comparison. +template bool operator!=(const Chain &a, const Chain &b); + +//! Iterator over concatenated iterator ranges. +template class Chain::iterator { +public: + //! Category of the iterator. + using iterator_category = std::forward_iterator_tag; + //! Type iterated over. + using value_type = typename std::iterator_traits::value_type; + //! Type for distance between iterators. + using difference_type = typename std::iterator_traits::difference_type; + //! Pointer to `value_type`. + using pointer = typename std::iterator_traits::pointer; + //! Type returned by the dereference operator. + using reference = typename std::iterator_traits::reference; + + //! Constructor. + //! + //!\param iterable Associated chain. + //!\param q Iterator to current segment. + iterator( + const Chain &iterable, + const typename std::vector>::const_iterator q); + + //! Equality comparison. + bool operator==(const iterator &other) const; + + //! Inequality comparison. + bool operator!=(const iterator &other) const; + + //! Preincrement. + iterator &operator++(); + + //! Postincrement. + iterator operator++(int); + + //! Dereference. + reference operator*() const; + +private: + //! Associated bit range. + const Chain &iterable; + + //! Iterator to current segment. + typename std::vector>::const_iterator q; + + //! Position in the current segment. + It p; + + //! Distance from the beginning of the current segment. + std::size_t i; + + //! Length of the current segment. + std::size_t n; + + //! Zero `i`; populate `p` and `n` from `q` if not at end. + void conditionally_start_segment(); +}; + } // namespace mgard #include "utilities.tpp" diff --git a/include/utilities.tpp b/include/utilities.tpp index 52fd9b4ba7..296e1f6f0c 100644 --- a/include/utilities.tpp +++ b/include/utilities.tpp @@ -201,6 +201,8 @@ bool operator!=(const RangeSlice &a, const RangeSlice &b) { return !operator==(a, b); } +#ifndef __NVCC__ + template CartesianProduct::CartesianProduct(const std::array factors) : factors(factors) { @@ -325,6 +327,8 @@ typename CartesianProduct::iterator::reference return value; } +#endif + template void check_dimension_index_bounds(const std::size_t dimension) { if (dimension >= N) { @@ -345,4 +349,80 @@ template MemoryBuffer::MemoryBuffer(const std::size_t size) : MemoryBuffer(new T[size], size) {} +template +Chain::Chain(const std::vector> &segments) + : segments(segments) {} + +template bool operator==(const Chain &a, const Chain &b) { + return a.segments == b.segments; +} + +template bool operator!=(const Chain &a, const Chain &b) { + return !operator==(a, b); +} + +template typename Chain::iterator Chain::begin() const { + return {*this, segments.begin()}; +} + +template typename Chain::iterator Chain::end() const { + return {*this, segments.end()}; +} + +template +Chain::iterator::iterator( + const Chain &iterable, + const typename std::vector>::const_iterator q) + : iterable(iterable), q(q) { + conditionally_start_segment(); +} + +template void Chain::iterator::conditionally_start_segment() { + i = 0; + if (q != iterable.segments.end()) { + const std::pair pair = *q; + p = pair.first; + n = pair.second; + if (not n) { + ++q; + conditionally_start_segment(); + } + } +} + +template +bool Chain::iterator:: +operator==(const typename Chain::iterator &other) const { + return i == other.i and q == other.q and iterable == other.iterable; +} + +template +bool Chain::iterator:: +operator!=(const typename Chain::iterator &other) const { + return !operator==(other); +} + +template +typename Chain::iterator &Chain::iterator::operator++() { + ++p; + ++i; + if (i == n) { + ++q; + conditionally_start_segment(); + } + return *this; +} + +template +typename Chain::iterator Chain::iterator::operator++(int) { + const iterator tmp = *this; + operator++(); + return tmp; +} + +template +typename Chain::iterator::reference Chain::iterator::operator*() const { + return *p; +} + } // namespace mgard diff --git a/src/compressors.cpp b/src/compressors.cpp deleted file mode 100644 index 915912f7f9..0000000000 --- a/src/compressors.cpp +++ /dev/null @@ -1,703 +0,0 @@ -#include "compressors.hpp" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include "format.hpp" - -#ifdef MGARD_TIMING -#include -#include -#endif - -#ifdef MGARD_ZSTD -#include -#endif - -namespace mgard { - -const int nql = 32768 * 4; - -struct htree_node { - int q; - size_t cnt; - unsigned int code; - size_t len; - htree_node *left; - htree_node *right; -}; - -struct huffman_codec { - int q; - unsigned int code; - size_t len; -}; - -bool myfunction(htree_node i, htree_node j) { return (i.cnt < j.cnt); } - -htree_node *new_htree_node(int q, size_t cnt) { - htree_node *new_node = new htree_node; - new_node->q = q; - new_node->cnt = cnt; - new_node->code = 0; - new_node->len = 0; - new_node->left = 0; - new_node->right = 0; - - return new_node; -} - -struct LessThanByCnt { - bool operator()(const htree_node *lhs, const htree_node *rhs) const { - return lhs->cnt > rhs->cnt; - } -}; - -template -using my_priority_queue = - std::priority_queue, LessThanByCnt>; - -void build_codec(htree_node *root, unsigned int code, size_t len, - huffman_codec *codec) { - - root->len = len; - root->code = code; - - if (!root->left && !root->right) { - codec[root->q].q = root->q; - codec[root->q].code = code; - codec[root->q].len = len; - } - - if (root->left) { - build_codec(root->left, code << 1, len + 1, codec); - } - - if (root->right) { - build_codec(root->right, code << 1 | 0x1, len + 1, codec); - } -} - -my_priority_queue *build_tree(size_t *cnt) { - my_priority_queue *phtree; - phtree = new my_priority_queue; -#if 1 - for (int i = 0; i < nql; i++) { - if (cnt[i] != 0) { - htree_node *new_node = new_htree_node(i, cnt[i]); - phtree->push(new_node); - } - } - - while (phtree->size() > 1) { - htree_node *top_node1 = phtree->top(); - phtree->pop(); - htree_node *top_node2 = phtree->top(); - phtree->pop(); - - htree_node *new_node = new_htree_node(-1, top_node1->cnt + top_node2->cnt); - new_node->left = top_node1; - new_node->right = top_node2; - phtree->push(new_node); - } -#endif - return phtree; -} - -void free_htree_node(htree_node *node) { - if (node->left) { - free_htree_node(node->left); - node->left = 0; - } - - if (node->right) { - free_htree_node(node->right); - node->right = 0; - } - - delete node; -} - -void free_tree(my_priority_queue *phtree) { - if (phtree) { - free_htree_node(phtree->top()); - - phtree->pop(); - - delete phtree; - } -} - -// Note this function will change the quantized data. -size_t *build_ft(long int *quantized_data, const std::size_t n, - size_t &num_outliers) { - size_t *cnt = (size_t *)malloc(nql * sizeof(size_t)); - std::memset(cnt, 0, nql * sizeof(size_t)); - - for (std::size_t i = 0; i < n; i++) { - // Convert quantization level to positive so that counting freq can be - // easily done. Level 0 is reserved a out-of-range flag. - quantized_data[i] = quantized_data[i] + nql / 2; - if (quantized_data[i] > 0 && quantized_data[i] < nql) { - cnt[quantized_data[i]]++; - } else { - cnt[0]++; - } - } - - num_outliers = cnt[0]; - - return cnt; -} - -huffman_codec *build_huffman_codec(long int *quantized_data, size_t **ft, - const std::size_t n, size_t &num_outliers) { - size_t *cnt; - - cnt = build_ft(quantized_data, n, num_outliers); - *ft = cnt; - - my_priority_queue *phtree = build_tree(cnt); - - huffman_codec *codec = (huffman_codec *)malloc(sizeof(huffman_codec) * nql); - std::memset(codec, 0, sizeof(huffman_codec) * nql); - - build_codec(phtree->top(), 0, 0, codec); - - free_tree(phtree); - phtree = 0; - - return codec; -} - -void huffman_decoding(long int *quantized_data, - const std::size_t quantized_data_size, - unsigned char *out_data_hit, size_t out_data_hit_size, - unsigned char *out_data_miss, size_t out_data_miss_size, - unsigned char *out_tree, size_t out_tree_size) { - size_t *cft = (size_t *)out_tree; - int nonZeros = out_tree_size / (2 * sizeof(size_t)); - size_t *ft = (size_t *)malloc(nql * sizeof(size_t)); - - std::memset(ft, 0, nql * sizeof(size_t)); - - for (int j = 0; j < nonZeros; j++) { - ft[cft[2 * j]] = cft[2 * j + 1]; - } - - my_priority_queue *phtree = build_tree(ft); - - unsigned int *buf = (unsigned int *)out_data_hit; - - // The out_data_miss may not be aligned. Therefore, the code - // here makes a new buffer. - int *miss_buf = (int *)malloc(out_data_miss_size); - if (out_data_miss_size) { - std::memcpy(miss_buf, out_data_miss, out_data_miss_size); - } - - int *miss_bufp = miss_buf; - - size_t start_bit = 0; - unsigned int mask = 0x80000000; - - long int *q = quantized_data; - size_t i = 0; - size_t num_missed = 0; - while (q < (quantized_data + (quantized_data_size / sizeof(*q)))) { - htree_node *root = phtree->top(); - assert(root); - - size_t len = 0; - int offset = 0; - while (root->left) { - int flag = *(buf + start_bit / 32 + offset) & mask; - if (!flag) { - root = root->left; - } else { - root = root->right; - } - - len++; - - mask >>= 1; - if (!mask) { - mask = 0x80000000; - offset = 1; - } else { - // offset = 0; - } - } - - if (root->q != 0) { - *q = root->q - nql / 2; - - } else { - *q = *miss_buf - nql / 2; - - miss_buf++; - num_missed++; - } - - q++; - i++; - - start_bit += len; - } - - assert(start_bit == out_data_hit_size); - assert(sizeof(int) * num_missed == out_data_miss_size); - - // Avoid unused argument warning. If NDEBUG is defined, then the assert - // becomes empty and out_data_hit_size is unused. Tell the compiler that - // is OK and expected. - (void)out_data_hit_size; - - free(miss_bufp); - miss_bufp = 0; - free_tree(phtree); - phtree = 0; - free(ft); - ft = 0; -} - -void decompress_memory_huffman(unsigned char *const src, - const std::size_t srcLen, long int *const dst, - const std::size_t dstLen) { - unsigned char *out_data_hit = 0; - size_t out_data_hit_size; - unsigned char *out_data_miss = 0; - size_t out_data_miss_size; - unsigned char *out_tree = 0; - size_t out_tree_size; - - unsigned char *buf = src; - - out_tree_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_hit_size = *(size_t *)buf; - buf += sizeof(size_t); - - out_data_miss_size = *(size_t *)buf; - buf += sizeof(size_t); - size_t total_huffman_size = - out_tree_size + out_data_hit_size / 8 + 4 + out_data_miss_size; - unsigned char *huffman_encoding_p = - (unsigned char *)malloc(total_huffman_size); -#ifndef MGARD_ZSTD - decompress_memory_z(buf, srcLen - 3 * sizeof(size_t), huffman_encoding_p, - total_huffman_size); -#else - decompress_memory_zstd(buf, srcLen - 3 * sizeof(size_t), huffman_encoding_p, - total_huffman_size); -#endif - out_tree = huffman_encoding_p; - out_data_hit = huffman_encoding_p + out_tree_size; - out_data_miss = - huffman_encoding_p + out_tree_size + out_data_hit_size / 8 + 4; - - huffman_decoding(dst, dstLen, out_data_hit, out_data_hit_size, out_data_miss, - out_data_miss_size, out_tree, out_tree_size); - - free(huffman_encoding_p); -} - -void huffman_encoding(long int *quantized_data, const std::size_t n, - unsigned char **out_data_hit, size_t *out_data_hit_size, - unsigned char **out_data_miss, size_t *out_data_miss_size, - unsigned char **out_tree, size_t *out_tree_size) { - size_t num_miss = 0; - size_t *ft = 0; - - huffman_codec *codec = build_huffman_codec(quantized_data, &ft, n, num_miss); - - assert(n >= num_miss); - - /* For those miss points, we still need to maintain a flag (q = 0), - * and therefore we need to allocate space for n numbers. - */ - unsigned char *p_hit = (unsigned char *)malloc(n * sizeof(int)); - std::memset(p_hit, 0, n * sizeof(int)); - - int *p_miss = 0; - if (num_miss > 0) { - p_miss = (int *)malloc(num_miss * sizeof(int)); - std::memset(p_miss, 0, num_miss * sizeof(int)); - } - - *out_data_hit = p_hit; - *out_data_miss = (unsigned char *)p_miss; - *out_data_hit_size = 0; - *out_data_miss_size = 0; - - size_t start_bit = 0; - unsigned int *cur = (unsigned int *)p_hit; - size_t cnt_missed = 0; - for (std::size_t i = 0; i < n; i++) { - int q = quantized_data[i]; - unsigned int code; - size_t len; - - if (q > 0 && q < nql) { - // for those that are within the range - code = codec[q].code; - len = codec[q].len; - } else { - // for those that are out of the range, q is set to 0 - code = codec[0].code; - len = codec[0].len; - - *p_miss = q; - p_miss++; - cnt_missed++; - } - - // Note that if len == 0, then that means that either the data is all the - // same number or (more likely) all data are outside the quantization - // range. Either way, the code contains no information and is therefore 0 - // bits. - - if (32 - start_bit % 32 < len) { - // current unsigned int cannot hold the code - // copy 32 - start_bit % 32 bits to the current int - // and copy the rest len - (32 - start_bit % 32) to the next int - size_t rshift = len - (32 - start_bit % 32); - size_t lshift = 32 - rshift; - *(cur + start_bit / 32) = (*(cur + start_bit / 32)) | (code >> rshift); - *(cur + start_bit / 32 + 1) = - (*(cur + start_bit / 32 + 1)) | (code << lshift); - start_bit += len; - } else if (len > 0) { - code = code << (32 - start_bit % 32 - len); - *(cur + start_bit / 32) = (*(cur + start_bit / 32)) | code; - start_bit += len; - } else { - // Sequence is empty (everything must be the same). Do nothing. - } - } - - // Note: hit size is in bits, while miss size is in bytes. - *out_data_hit_size = start_bit; - *out_data_miss_size = num_miss * sizeof(int); - - // write frequency table to buffer - int nonZeros = 0; - for (int i = 0; i < nql; i++) { - if (ft[i] > 0) { - nonZeros++; - } - } - - size_t *cft = (size_t *)malloc(2 * nonZeros * sizeof(size_t)); - int off = 0; - for (int i = 0; i < nql; i++) { - if (ft[i] > 0) { - cft[2 * off] = i; - cft[2 * off + 1] = ft[i]; - off++; - } - } - - *out_tree = (unsigned char *)cft; - *out_tree_size = 2 * nonZeros * sizeof(size_t); - free(ft); - ft = 0; - - free(codec); - codec = 0; -} - -MemoryBuffer compress_memory_huffman(long int *const src, - const std::size_t srcLen) { - unsigned char *out_data_hit = 0; - size_t out_data_hit_size; - unsigned char *out_data_miss = 0; - size_t out_data_miss_size; - unsigned char *out_tree = 0; - size_t out_tree_size; -#ifdef MGARD_TIMING - auto huff_time1 = std::chrono::high_resolution_clock::now(); -#endif - huffman_encoding(src, srcLen, &out_data_hit, &out_data_hit_size, - &out_data_miss, &out_data_miss_size, &out_tree, - &out_tree_size); -#ifdef MGARD_TIMING - auto huff_time2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - huff_time2 - huff_time1); - std::cout << "Huffman tree time = " << (double)duration.count() / 1000000 - << "\n"; -#endif - const size_t total_size = - out_data_hit_size / 8 + 4 + out_data_miss_size + out_tree_size; - unsigned char *payload = (unsigned char *)malloc(total_size); - unsigned char *bufp = payload; - - if (out_tree_size) { - std::memcpy(bufp, out_tree, out_tree_size); - bufp += out_tree_size; - } - - std::memcpy(bufp, out_data_hit, out_data_hit_size / 8 + 4); - bufp += out_data_hit_size / 8 + 4; - - if (out_data_miss_size) { - std::memcpy(bufp, out_data_miss, out_data_miss_size); - bufp += out_data_miss_size; - } - - free(out_tree); - free(out_data_hit); - free(out_data_miss); - -#ifndef MGARD_ZSTD -#ifdef MGARD_TIMING - auto z_time1 = std::chrono::high_resolution_clock::now(); -#endif - const MemoryBuffer out_data = - compress_memory_z(payload, total_size); -#ifdef MGARD_TIMING - auto z_time2 = std::chrono::high_resolution_clock::now(); - auto z_duration = - std::chrono::duration_cast(z_time2 - z_time1); - std::cout << "ZLIB compression time = " - << (double)z_duration.count() / 1000000 << "\n"; -#endif -#else -#ifdef MGARD_TIMING - auto zstd_time1 = std::chrono::high_resolution_clock::now(); -#endif - const MemoryBuffer out_data = - compress_memory_zstd(payload, total_size); -#ifdef MGARD_TIMING - auto zstd_time2 = std::chrono::high_resolution_clock::now(); - auto zstd_duration = std::chrono::duration_cast( - zstd_time2 - zstd_time1); - std::cout << "ZSTD compression time = " - << (double)zstd_duration.count() / 1000000 << "\n"; -#endif -#endif - free(payload); - payload = 0; - - const std::size_t bufferLen = 3 * sizeof(size_t) + out_data.size; - unsigned char *const buffer = new unsigned char[bufferLen]; - - bufp = buffer; - *(size_t *)bufp = out_tree_size; - bufp += sizeof(size_t); - - *(size_t *)bufp = out_data_hit_size; - bufp += sizeof(size_t); - - *(size_t *)bufp = out_data_miss_size; - bufp += sizeof(size_t); - - { - unsigned char const *const p = out_data.data.get(); - std::copy(p, p + out_data.size, bufp); - } - return MemoryBuffer(buffer, bufferLen); -} - -#ifdef MGARD_ZSTD -/*! CHECK - * Check that the condition holds. If it doesn't print a message and die. - */ -#define CHECK(cond, ...) \ - do { \ - if (!(cond)) { \ - fprintf(stderr, "%s:%d CHECK(%s) failed: ", __FILE__, __LINE__, #cond); \ - fprintf(stderr, "" __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - exit(1); \ - } \ - } while (0) - -/*! CHECK_ZSTD - * Check the zstd error code and die if an error occurred after printing a - * message. - */ -/*! CHECK_ZSTD - * Check the zstd error code and die if an error occurred after printing a - * message. - */ -#define CHECK_ZSTD(fn, ...) \ - do { \ - size_t const err = (fn); \ - CHECK(!ZSTD_isError(err), "%s", ZSTD_getErrorName(err)); \ - } while (0) - -MemoryBuffer compress_memory_zstd(void const *const src, - const std::size_t srcLen) { - const size_t cBuffSize = ZSTD_compressBound(srcLen); - unsigned char *const buffer = new unsigned char[cBuffSize]; - const std::size_t cSize = ZSTD_compress(buffer, cBuffSize, src, srcLen, 1); - CHECK_ZSTD(cSize); - return MemoryBuffer(buffer, cSize); -} -#endif - -MemoryBuffer compress_memory_z(void z_const *const src, - const std::size_t srcLen) { - const std::size_t BUFSIZE = 2048 * 1024; - std::vector buffers; - std::vector bufferLengths; - - z_stream strm; - strm.zalloc = Z_NULL; - strm.zfree = Z_NULL; - strm.next_in = static_cast(src); - strm.avail_in = srcLen; - buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); - bufferLengths.push_back(strm.avail_out = BUFSIZE); - - deflateInit(&strm, Z_BEST_COMPRESSION); - - while (strm.avail_in != 0) { - [[maybe_unused]] const int res = deflate(&strm, Z_NO_FLUSH); - assert(res == Z_OK); - if (strm.avail_out == 0) { - buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); - bufferLengths.push_back(strm.avail_out = BUFSIZE); - } - } - - int res = Z_OK; - while (res == Z_OK) { - if (strm.avail_out == 0) { - buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); - bufferLengths.push_back(strm.avail_out = BUFSIZE); - } - res = deflate(&strm, Z_FINISH); - } - - assert(res == Z_STREAM_END); - bufferLengths.back() -= strm.avail_out; - // Could just do `nbuffers * BUFSIZE - strm.avail_out`. - const std::size_t bufferLen = - std::accumulate(bufferLengths.begin(), bufferLengths.end(), 0); - unsigned char *const buffer = new unsigned char[bufferLen]; - { - const std::size_t nbuffers = buffers.size(); - unsigned char *p = buffer; - for (std::size_t i = 0; i < nbuffers; ++i) { - unsigned char const *const buffer = buffers.at(i); - const std::size_t bufferLength = bufferLengths.at(i); - std::copy(buffer, buffer + bufferLength, p); - p += bufferLength; - delete[] buffer; - } - } - deflateEnd(&strm); - - return MemoryBuffer(buffer, bufferLen); -} - -void decompress_memory_z(void z_const *const src, const std::size_t srcLen, - unsigned char *const dst, const std::size_t dstLen) { - z_stream strm = {}; - strm.total_in = strm.avail_in = srcLen; - strm.total_out = strm.avail_out = dstLen; - strm.next_in = static_cast(src); - strm.next_out = reinterpret_cast(dst); - - strm.zalloc = Z_NULL; - strm.zfree = Z_NULL; - strm.opaque = Z_NULL; - - [[maybe_unused]] int res; - res = inflateInit2(&strm, (15 + 32)); // 15 window bits, and the +32 tells - // zlib to to detect if using gzip or - // zlib - assert(res == Z_OK); - res = inflate(&strm, Z_FINISH); - assert(res == Z_STREAM_END); - res = inflateEnd(&strm); - assert(res == Z_OK); -} - -#ifdef MGARD_ZSTD -void decompress_memory_zstd(void const *const src, const std::size_t srcLen, - unsigned char *const dst, - const std::size_t dstLen) { - size_t const dSize = ZSTD_decompress(dst, dstLen, src, srcLen); - CHECK_ZSTD(dSize); - - /* When zstd knows the content size, it will error if it doesn't match. */ - CHECK(dstLen == dSize, "Impossible because zstd will check this condition!"); -} -#endif - -MemoryBuffer compress(const pb::Header &header, void *const src, - const std::size_t srcLen) { - switch (header.encoding().compressor()) { - case pb::Encoding::CPU_HUFFMAN_ZSTD: -#ifdef MGARD_ZSTD - { - if (header.quantization().type() != mgard::pb::Quantization::INT64_T) { - throw std::runtime_error("Huffman tree not implemented for quantization " - "types other than `std::int64_t`"); - } - // Quantization type size. - const std::size_t qts = quantization_buffer(header, 1).size; - if (srcLen % qts) { - throw std::runtime_error("incorrect quantization buffer size"); - } - return compress_memory_huffman(reinterpret_cast(src), - srcLen / qts); - } -#else - throw std::runtime_error("MGARD compiled without ZSTD support"); -#endif - case pb::Encoding::CPU_HUFFMAN_ZLIB: - return compress_memory_z(src, srcLen); - default: - throw std::runtime_error("unrecognized lossless compressor"); - } -} - -void decompress(const pb::Header &header, void *const src, - const std::size_t srcLen, void *const dst, - const std::size_t dstLen) { - switch (read_encoding_compressor(header)) { - case pb::Encoding::NOOP_COMPRESSOR: - if (srcLen != dstLen) { - throw std::invalid_argument( - "source and destination lengths must be equal"); - } - { - unsigned char const *const p = static_cast(src); - unsigned char *const q = static_cast(dst); - std::copy(p, p + srcLen, q); - } - break; - case pb::Encoding::CPU_HUFFMAN_ZLIB: - decompress_memory_z(const_cast(src), srcLen, - static_cast(dst), dstLen); - break; - case pb::Encoding::CPU_HUFFMAN_ZSTD: -#ifdef MGARD_ZSTD - decompress_memory_huffman(static_cast(src), srcLen, - static_cast(dst), dstLen); - break; -#else - throw std::runtime_error("MGARD compiled without ZSTD support"); -#endif - default: - throw std::runtime_error("unsupported lossless encoder"); - } -} - -} // namespace mgard diff --git a/src/cuda/LosslessCompression.cu b/src/cuda/LosslessCompression.cu index feb61ab2d4..3d32d660f7 100644 --- a/src/cuda/LosslessCompression.cu +++ b/src/cuda/LosslessCompression.cu @@ -5,7 +5,7 @@ * Date: September 27, 2021 */ -// #include "compressors.hpp" +// #include "lossless.hpp" #include "cuda/Common.h" #include "cuda/CommonInternal.h" #include "cuda/LosslessCompression.h" @@ -90,7 +90,7 @@ unsigned char *compress_memory_huffman(long int *const src, free(out_data_miss); // const MemoryBuffer out_data = - // compress_memory_zstd(payload, total_size); + // compress_zstd(payload, total_size); const size_t cBuffSize = ZSTD_compressBound(total_size); unsigned char *const zstd_buffer = new unsigned char[cBuffSize]; @@ -148,7 +148,7 @@ void decompress_memory_huffman(unsigned char *const src, out_tree_size + out_data_hit_size / 8 + 4 + out_data_miss_size; unsigned char *huffman_encoding_p = (unsigned char *)malloc(total_huffman_size); - // decompress_memory_zstd(buf, srcLen - 3 * sizeof(size_t), + // decompress_zstd(buf, srcLen - 3 * sizeof(size_t), // huffman_encoding_p, // total_huffman_size); diff --git a/src/format.cpp b/src/format.cpp index f9c4c62aa9..6056c8d262 100644 --- a/src/format.cpp +++ b/src/format.cpp @@ -45,6 +45,35 @@ serialize_header_crc32(std::uint_least64_t crc32) { return serialize(crc32); } +namespace { + +template +void check_quantization_buffer_(void const *const p, const std::size_t n) { + if (n % sizeof(Int)) { + throw std::runtime_error( + "quantization buffer size not a multiple of quantization type size"); + } + check_alignment(p); +} + +} // namespace + +void check_quantization_buffer(const pb::Header &header, void const *const p, + const std::size_t n) { + switch (header.quantization().type()) { + case pb::Quantization::INT8_T: + return check_quantization_buffer_(p, n); + case pb::Quantization::INT16_T: + return check_quantization_buffer_(p, n); + case pb::Quantization::INT32_T: + return check_quantization_buffer_(p, n); + case pb::Quantization::INT64_T: + return check_quantization_buffer_(p, n); + default: + throw std::runtime_error("unrecognized quantization type"); + } +} + template <> pb::Dataset::Type type_to_dataset_type() { return pb::Dataset::FLOAT; } @@ -53,6 +82,22 @@ template <> pb::Dataset::Type type_to_dataset_type() { return pb::Dataset::DOUBLE; } +template <> pb::Quantization::Type type_to_quantization_type() { + return pb::Quantization::INT8_T; +} + +template <> pb::Quantization::Type type_to_quantization_type() { + return pb::Quantization::INT16_T; +} + +template <> pb::Quantization::Type type_to_quantization_type() { + return pb::Quantization::INT32_T; +} + +template <> pb::Quantization::Type type_to_quantization_type() { + return pb::Quantization::INT64_T; +} + MemoryBuffer quantization_buffer(const pb::Header &header, const std::size_t ndof) { static_assert(CHAR_BIT == 8, "unexpected number of bits in a byte"); @@ -132,6 +177,7 @@ void populate_defaults(pb::Header &header) { pb::Encoding::CPU_HUFFMAN_ZLIB #endif ); + e.set_serialization(pb::Encoding::RFMH); } { pb::Device &device = *header.mutable_device(); @@ -204,7 +250,7 @@ pb::Header read_metadata(BufferWindow &window) { const uint_least64_t header_size = read_header_size(window); const uint_least32_t header_crc32 = read_header_crc32(window); check_header_crc32(window, header_size, header_crc32); - return read_header(window, header_size); + return read_message(window, header_size); } namespace { @@ -232,28 +278,6 @@ void write_metadata(std::ostream &ostream, const pb::Header &header) { delete[] header_bytes; } -pb::Header read_header(BufferWindow &window, - const std::uint_least64_t header_size) { - // The `CodedInputStream` constructor takes an `int`. - if (header_size > std::numeric_limits::max()) { - throw std::runtime_error("header is too large (size would overflow)"); - } - // Check that the read will stay in the buffer. - unsigned char const *const next = window.next(header_size); - mgard::pb::Header header; - google::protobuf::io::CodedInputStream stream( - static_cast(window.current), - header_size); - if (not header.ParseFromCodedStream(&stream)) { - throw std::runtime_error("header parsing encountered read or format error"); - } - if (not stream.ConsumedEntireMessage()) { - throw std::runtime_error("part of header left unparsed"); - } - window.current = next; - return header; -} - void check_mgard_version(const pb::Header &header) { const pb::VersionNumber &mgard_version = header.mgard_version(); if (mgard_version.major_() > MGARD_VERSION_MAJOR) { diff --git a/src/huffman.cpp b/src/huffman.cpp new file mode 100644 index 0000000000..2f771c3969 --- /dev/null +++ b/src/huffman.cpp @@ -0,0 +1,418 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "huffman.hpp" +#include "lossless.hpp" + +namespace mgard { + +HuffmanEncodedStream::HuffmanEncodedStream(const std::size_t nbits, + const std::size_t nmissed, + const std::size_t ntable) + : nbits(nbits), hit(sizeof(unsigned int) * + ((nbits + CHAR_BIT * sizeof(unsigned int) - 1) / + (CHAR_BIT * sizeof(unsigned int)))), + missed(nmissed), frequencies(ntable) { + unsigned char *const p = hit.data.get(); + // Zero out the bytes we won't write to. If `nbits % CHAR_BIT`, there will + // still be bits in the final byte that aren't zeroed out. + std::fill(p + (nbits + CHAR_BIT - 1) / CHAR_BIT, p + hit.size, 0); +} + +namespace { + +void check_type_sizes() { + static_assert(CHAR_BIT == 8, + "code written with assumption that `CHAR_BIT == 8`"); + static_assert( + sizeof(unsigned int) == 4, + "code written with assumption that `sizeof(unsigned int) == 4`"); + static_assert(sizeof(int) == 4, + "code written with assumption that `sizeof(int) == 4`"); + static_assert( + sizeof(std::size_t) == 8, + "code written with assumption that `sizeof(unsigned int) == 8`"); +} + +using Constituent = std::pair; + +MemoryBuffer +gather(const std::vector &constituents) { + std::size_t nbuffer = 0; + for (const Constituent &constituent : constituents) { + nbuffer += constituent.second; + } + MemoryBuffer buffer(nbuffer); + unsigned char *p = buffer.data.get(); + for (const Constituent &constituent : constituents) { + std::memcpy(p, constituent.first, constituent.second); + p += constituent.second; + } + return buffer; +} + +MemoryBuffer +compress_serialized_huffman(const pb::Header &header, + const MemoryBuffer &payload) { + switch (header.encoding().compressor()) { + case pb::Encoding::CPU_HUFFMAN_ZLIB: + return compress_zlib(payload.data.get(), payload.size); + case pb::Encoding::CPU_HUFFMAN_ZSTD: +#ifdef MGARD_ZSTD + return compress_zstd(payload.data.get(), payload.size); +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif + default: + throw std::runtime_error("unrecognized lossless compressor"); + } +} + +} // namespace + +MemoryBuffer +serialize_compress(const pb::Header &header, + const HuffmanEncodedStream &encoded) { + check_type_sizes(); + + if (header.encoding().serialization() != pb::Encoding::DEPRECATED) { + throw std::runtime_error( + "Huffman tree not to be serialized with deprecated method"); + } + + const std::size_t offset = encoded.nbits % (CHAR_BIT * sizeof(unsigned int)); + // Number of hit buffer padding bytes. + const std::size_t nhbpb = offset ? offset / CHAR_BIT : sizeof(unsigned int); + + // The righthand side is how the size in bytes of the padded hit buffer was + // originally calculated. + assert(encoded.hit.size + nhbpb == + encoded.nbits / CHAR_BIT + sizeof(unsigned int)); + + unsigned char const *hbpb = new unsigned char[nhbpb](); + MemoryBuffer payload = gather({ + {encoded.frequencies.data.get(), encoded.frequencies.size}, + {encoded.hit.data.get(), encoded.hit.size}, + {hbpb, nhbpb}, + {encoded.missed.data.get(), encoded.missed.size}, + }); + delete[] hbpb; + + const MemoryBuffer compressed = + compress_serialized_huffman(header, payload); + return gather( + {{reinterpret_cast(&encoded.frequencies.size), + sizeof(encoded.frequencies.size)}, + {reinterpret_cast(&encoded.nbits), + sizeof(encoded.nbits)}, + {reinterpret_cast(&encoded.missed.size), + sizeof(encoded.missed.size)}, + {compressed.data.get(), compressed.size}}); +} + +HuffmanEncodedStream decompress_deserialize(const pb::Header &header, + unsigned char const *const src, + const std::size_t srcLen) { + if (header.encoding().serialization() != pb::Encoding::DEPRECATED) { + throw std::runtime_error( + "Huffman tree not serialized with deprecated method"); + } + + std::size_t const *const sizes = reinterpret_cast(src); + const std::size_t nfrequencies = sizes[0]; + const std::size_t nbits = sizes[1]; + const std::size_t nmissed = sizes[2]; + // This is how the size in bytes of the padded hit buffer was calculated + // in `decompress_memory_huffman` before this function was introduced. + const std::size_t nhit = nbits / CHAR_BIT + sizeof(unsigned int); + + MemoryBuffer buffer(nfrequencies + nhit + nmissed); + { + const std::size_t offset = 3 * sizeof(std::size_t); + unsigned char const *const src_ = src + offset; + const std::size_t srcLen_ = srcLen - offset; + unsigned char *const dst_ = buffer.data.get(); + const std::size_t dstLen_ = buffer.size; + + switch (header.encoding().compressor()) { + case pb::Encoding::CPU_HUFFMAN_ZLIB: + decompress_zlib(src_, srcLen_, dst_, dstLen_); + break; + case pb::Encoding::CPU_HUFFMAN_ZSTD: +#ifdef MGARD_ZSTD + decompress_zstd(src_, srcLen_, dst_, dstLen_); + break; +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif + default: + throw std::runtime_error("unrecognized lossless compressor"); + } + } + + HuffmanEncodedStream encoded(nbits, nmissed, nfrequencies); + { + unsigned char const *begin; + unsigned char const *end; + + begin = buffer.data.get(); + end = begin + nfrequencies; + std::copy(begin, end, encoded.frequencies.data.get()); + + begin = end; + assert(encoded.hit.size <= nhit); + end = begin + encoded.hit.size; + std::copy(begin, end, encoded.hit.data.get()); + + // Skip any bytes between `begin + encoded.hit.size` and `begin + nhit`. + begin = end + nhit - encoded.hit.size; + end = begin + nmissed; + std::copy(begin, end, encoded.missed.data.get()); + } + + return encoded; +} + +void HuffmanCodeword::push_back(const bool bit) { + const unsigned char offset = length % CHAR_BIT; + if (not offset) { + bytes.push_back(0); + } + bytes.back() |= static_cast(bit) << (CHAR_BIT - 1 - offset); + ++length; +} + +HuffmanCodeword HuffmanCodeword::left() const { + HuffmanCodeword tmp = *this; + tmp.push_back(false); + return tmp; +} + +HuffmanCodeword HuffmanCodeword::right() const { + HuffmanCodeword tmp = *this; + tmp.push_back(true); + return tmp; +} + +CodeCreationTreeNode::CodeCreationTreeNode(HuffmanCodeword *const codeword, + const std::size_t count) + : codeword(codeword), count(count) {} + +CodeCreationTreeNode::CodeCreationTreeNode( + const std::shared_ptr &left, + const std::shared_ptr &right) + : count(left->count + right->count), left(left), right(right) {} + +namespace { + +void endianness_shuffle(unsigned char *const buffer, const std::size_t nbytes) { + if (nbytes % sizeof(unsigned int)) { + throw std::runtime_error( + "buffer size not a multiple of `sizeof(unsigned int)`"); + } + const unsigned int one{1}; + const bool little_endian = *reinterpret_cast(&one); + if (little_endian) { + for (std::size_t i = 0; i < nbytes; i += sizeof(unsigned int)) { + unsigned char *a = buffer + i; + unsigned char *b = a + sizeof(unsigned int) - 1; + for (std::size_t j = 0; j < sizeof(unsigned int) / 2; ++j) { + std::swap(*a++, *b--); + } + } + } +} + +const std::pair nql_endpoints{ + -static_cast((nql - 1) / 2), nql / 2 - 1}; + +} // namespace + +HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, + const std::size_t n) { + check_type_sizes(); + + using Symbol = long int; + using MissedSymbol = int; + + const HuffmanCode code(nql_endpoints, quantized_data, + quantized_data + n); + + const std::size_t nbits = code.nbits_hit(); + const std::size_t nnz = + code.ncodewords - + std::count(code.frequencies.begin(), code.frequencies.end(), 0); + + HuffmanEncodedStream out(nbits, code.nmissed() * sizeof(MissedSymbol), + 2 * nnz * sizeof(std::size_t)); + + // Write frequency table. + { + std::size_t *p = + reinterpret_cast(out.frequencies.data.get()); + const std::vector &frequencies = code.frequencies; + for (std::size_t i = 0; i < code.ncodewords; ++i) { + const std::size_t frequency = frequencies.at(i); + if (frequency) { + *p++ = i; + *p++ = frequency; + } + } + } + + unsigned char *const buffer = out.hit.data.get(); + { + unsigned char *const p = out.hit.data.get(); + std::fill(p, p + out.hit.size, 0); + } + unsigned char *hit = buffer; + + MissedSymbol *missed = + reinterpret_cast(out.missed.data.get()); + + unsigned char offset = 0; + for (const Symbol q : PseudoArray(quantized_data, n)) { + if (code.out_of_range(q)) { + // Remember that `missed` is an `int` rather than a `long int`. + *missed++ = q + nql / 2; + } + + const HuffmanCodeword codeword = code.codewords.at(code.index(q)); + std::size_t NREMAINING = codeword.length; + for (unsigned char byte : codeword.bytes) { + // Number of bits of `byte` left to write. + unsigned char nremaining = + std::min(static_cast(CHAR_BIT), NREMAINING); + // Premature, but this will hold when we're done with `byte`. + NREMAINING -= nremaining; + + while (nremaining) { + *hit |= byte >> offset; + // Number of bits of `byte` just written (not cumulative). + const unsigned char nwritten = std::min( + nremaining, static_cast( + static_cast(CHAR_BIT) - offset)); + offset += nwritten; + hit += offset / CHAR_BIT; + offset %= CHAR_BIT; + nremaining -= nwritten; + byte <<= nwritten; + } + } + } + + endianness_shuffle(buffer, out.hit.size); + return out; +} + +namespace { + +//! Decode a codeword (identified by associated leaf) to a symbol and shift. +//! +//!\pre `leaf` must be a leaf (rather than an interior node) of the code +//! creation tree. +//! +//!\deprecated +//! +//!\param code Code containing the code creation tree. +//!\param leaf Leaf (associated to a codeword) to decode. +//!\param missed Pointer to next out-of-range symbol. If `leaf` is associated +//! to the out-of-range codeword, this pointer will be dereferenced and +//! incremented. +long int decode_and_shift(const HuffmanCode &code, + const typename HuffmanCode::Node &leaf, + long int const *&missed) { + const std::pair pair = code.decode(leaf); + return pair.first ? pair.second : *missed++ - nql / 2; +} + +} // namespace + +MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { + check_type_sizes(); + + using Symbol = long int; + using MissedSymbol = int; + + const std::size_t nnz = encoded.frequencies.size / (2 * sizeof(std::size_t)); + std::vector> pairs(nnz); + std::size_t nquantized = 0; + { + std::size_t const *p = + reinterpret_cast(encoded.frequencies.data.get()); + for (std::pair &pair : pairs) { + const std::size_t index = *p++; + const std::size_t frequency = *p++; + pair = {index, frequency}; + nquantized += frequency; + } + } + + HuffmanCode code(nql_endpoints, pairs.begin(), pairs.end()); + + MemoryBuffer out(nquantized); + Symbol *q = out.data.get(); + + assert(not(encoded.missed.size % sizeof(MissedSymbol))); + const std::size_t nmissed = encoded.missed.size / sizeof(MissedSymbol); + Symbol *const missed = new Symbol[nmissed]; + { + MissedSymbol const *const p = + reinterpret_cast(encoded.missed.data.get()); + std::copy(p, p + nmissed, missed); + } + Symbol const *p_missed = missed; + + const std::size_t nbytes = encoded.hit.size; + unsigned char *const buffer = new unsigned char[nbytes]; + { + unsigned char const *const p = encoded.hit.data.get(); + std::copy(p, p + nbytes, buffer); + } + endianness_shuffle(buffer, nbytes); + const Bits bits(buffer, buffer + encoded.nbits / CHAR_BIT, + encoded.nbits % CHAR_BIT); + + std::size_t nbits = 0; + const HuffmanCode::Node root = code.queue.top(); + assert(root); + Bits::iterator b = bits.begin(); + for (std::size_t i = 0; i < nquantized; ++i) { + HuffmanCode::Node node; + for (node = root; node->left; + node = *b++ ? node->right : node->left, ++nbits) + ; + *q++ = decode_and_shift(code, node, p_missed); + } + assert(nbits == encoded.nbits); + assert(sizeof(MissedSymbol) * (p_missed - missed) == encoded.missed.size); + + delete[] missed; + delete[] buffer; + + return out; +} + +template <> +const std::pair + HuffmanCode::default_endpoints = { + -static_cast(1 << 17), + static_cast(1 << 17) - 1}; + +template <> +const std::pair + HuffmanCode::default_endpoints = { + -static_cast(1 << 17), + static_cast(1 << 17) - 1}; + +} // namespace mgard diff --git a/src/lossless_dispatcher.cpp b/src/lossless_dispatcher.cpp new file mode 100644 index 0000000000..eb6a71593a --- /dev/null +++ b/src/lossless_dispatcher.cpp @@ -0,0 +1,398 @@ +#include "lossless.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "format.hpp" +#include "huffman.hpp" +#include "utilities.hpp" + +namespace mgard { + +namespace { + +template +MemoryBuffer compress_huffman_C_rfmh_(const pb::Header &header, + void const *const src, + const std::size_t srcLen) { + check_quantization_buffer(header, src, srcLen); + + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + return huffman_encode(static_cast(src), srcLen / sizeof(Int)); +} + +// `C` being either ZSTD or `zlib`. +MemoryBuffer compress_huffman_C_rfmh(const pb::Header &header, + void const *const src, + const std::size_t srcLen) { + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + switch (header.quantization().type()) { + case pb::Quantization::INT8_T: + return compress_huffman_C_rfmh_(header, src, srcLen); + case pb::Quantization::INT16_T: + return compress_huffman_C_rfmh_(header, src, srcLen); + case pb::Quantization::INT32_T: + return compress_huffman_C_rfmh_(header, src, srcLen); + case pb::Quantization::INT64_T: + return compress_huffman_C_rfmh_(header, src, srcLen); + default: + throw std::runtime_error("unrecognized quantization type"); + } +} + +MemoryBuffer +compress_huffman_C_deprecated(const pb::Header &header, void const *const src, + const std::size_t srcLen) { + check_quantization_buffer(header, src, srcLen); + + assert(header.encoding().serialization() == pb::Encoding::DEPRECATED); + if (header.quantization().type() != mgard::pb::Quantization::INT64_T) { + throw std::runtime_error( + "deprecated Huffman coding not implemented for quantization " + "types other than `std::int64_t`"); + } + // I don't think it's strictly necessary that `std::int64_t` and `long int` + // are the same type. We could think of `long int` as a generic byte type, + // like `unsigned char`. Worth more attention if this assertion ever fails, + // though. That might be a good time to remove the deprecated Huffman coding + // functions. + static_assert(std::is_same::value, + "deprecated Huffman coding written with assumption that " + "`std::int64_t` is `long int`"); + + return serialize_compress( + header, huffman_encoding(reinterpret_cast(src), + srcLen / sizeof(long int))); +} + +MemoryBuffer compress_huffman_zlib_deprecated( + const pb::Header &header, void const *const src, const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + + return compress_huffman_C_deprecated(header, src, srcLen); +} + +#ifdef MGARD_ZSTD +MemoryBuffer compress_huffman_zstd_deprecated( + const pb::Header &header, void const *const src, const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + + return compress_huffman_C_deprecated(header, src, srcLen); +} +#endif + +namespace { + +// `decompress_zlib` and `decompress_zstd` need to know the size of the +// decompressed buffer before they can decompress. So, in addition to the +// compressed serialized Huffman tree (`compressed`), we need to store the size +// in bytes of the serialized Huffman tree (`nhuffman`). +MemoryBuffer concatenate_nhuffman_and_compressed( + const std::size_t nhuffman, const MemoryBuffer &compressed) { + MemoryBuffer out(HEADER_SIZE_SIZE + compressed.size); + unsigned char *p = out.data.get(); + + // Size in bytes of the serialized Huffman tree. + const std::array nhuffman_ = + serialize_header_size(nhuffman); + std::copy(nhuffman_.begin(), nhuffman_.end(), p); + p += HEADER_SIZE_SIZE; + + unsigned char const *const q = compressed.data.get(); + std::copy(q, q + compressed.size, p); + return out; +} + +} // namespace + +MemoryBuffer +compress_huffman_zlib_rfmh(const pb::Header &header, void const *const src, + const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + const MemoryBuffer encoded = + compress_huffman_C_rfmh(header, src, srcLen); + const MemoryBuffer compressed = + compress_zlib(encoded.data.get(), encoded.size); + return concatenate_nhuffman_and_compressed(encoded.size, compressed); +} + +#ifdef MGARD_ZSTD +MemoryBuffer +compress_huffman_zstd_rfmh(const pb::Header &header, void const *const src, + const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + const MemoryBuffer encoded = + compress_huffman_C_rfmh(header, src, srcLen); + return concatenate_nhuffman_and_compressed( + encoded.size, compress_zstd(encoded.data.get(), encoded.size)); +} +#endif + +MemoryBuffer compress_huffman_zlib(const pb::Header &header, + void const *const src, + const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + + switch (header.encoding().serialization()) { + case pb::Encoding::DEPRECATED: + return compress_huffman_zlib_deprecated(header, src, srcLen); + case pb::Encoding::RFMH: + return compress_huffman_zlib_rfmh(header, src, srcLen); + default: + throw std::runtime_error("unrecognized Huffman serialization"); + } +} + +#ifdef MGARD_ZSTD +MemoryBuffer compress_huffman_zstd(const pb::Header &header, + void const *const src, + const std::size_t srcLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + + switch (header.encoding().serialization()) { + case pb::Encoding::DEPRECATED: + return compress_huffman_zstd_deprecated(header, src, srcLen); + case pb::Encoding::RFMH: + return compress_huffman_zstd_rfmh(header, src, srcLen); + default: + throw std::runtime_error("unrecognized Huffman serialization"); + } +} +#endif + +} // namespace + +MemoryBuffer compress(const pb::Header &header, + void const *const src, + const std::size_t srcLen) { + switch (header.encoding().compressor()) { + case pb::Encoding::CPU_ZLIB: + return compress_zlib(src, srcLen); + case pb::Encoding::CPU_ZSTD: +#ifdef MGARD_ZSTD + return compress_zstd(src, srcLen); +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif + case pb::Encoding::CPU_HUFFMAN_ZLIB: + return compress_huffman_zlib(header, src, srcLen); + case pb::Encoding::CPU_HUFFMAN_ZSTD: +#ifdef MGARD_ZSTD + return compress_huffman_zstd(header, src, srcLen); +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif + default: + throw std::runtime_error("unrecognized lossless compressor"); + } +} + +namespace { + +template +void decompress_huffman_C_rfmh_(const pb::Header &header, + const MemoryBuffer &encoded, + void *const dst, const std::size_t dstLen) { + check_quantization_buffer(header, dst, dstLen); + + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + const MemoryBuffer decoded = huffman_decode(encoded); + if (sizeof(Int) * decoded.size != dstLen) { + throw std::runtime_error("size of destination buffer is incorrect"); + } + unsigned char const *const p = + reinterpret_cast(decoded.data.get()); + std::copy(p, p + dstLen, static_cast(dst)); +} + +// `C` being either ZSTD or `zlib`. +void decompress_huffman_C_rfmh(const pb::Header &header, + const MemoryBuffer &encoded, + void *const dst, const std::size_t dstLen) { + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + switch (header.quantization().type()) { + case pb::Quantization::INT8_T: + return decompress_huffman_C_rfmh_(header, encoded, dst, + dstLen); + case pb::Quantization::INT16_T: + return decompress_huffman_C_rfmh_(header, encoded, dst, + dstLen); + case pb::Quantization::INT32_T: + return decompress_huffman_C_rfmh_(header, encoded, dst, + dstLen); + case pb::Quantization::INT64_T: + return decompress_huffman_C_rfmh_(header, encoded, dst, + dstLen); + default: + throw std::runtime_error("unrecognized quantization type"); + } +} + +void decompress_huffman_C_deprecated(const pb::Header &header, + void const *const src, + const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + check_quantization_buffer(header, dst, dstLen); + + assert(header.encoding().serialization() == pb::Encoding::DEPRECATED); + if (header.quantization().type() != mgard::pb::Quantization::INT64_T) { + throw std::runtime_error( + "deprecated Huffman coding not implemented for quantization " + "types other than `std::int64_t`"); + } + // I don't think it's strictly necessary that `std::int64_t` and `long int` + // are the same type. We could think of `long int` as a generic byte type, + // like `unsigned char`. Worth more attention if this assertion ever fails, + // though. That might be a good time to remove the deprecated Huffman coding + // functions. + static_assert(std::is_same::value, + "deprecated Huffman coding written with assumption that " + "`std::int64_t` is `long int`"); + + const MemoryBuffer decoded = + huffman_decoding(decompress_deserialize( + header, reinterpret_cast(src), srcLen)); + if (sizeof(long int) * decoded.size != dstLen) { + throw std::runtime_error("size of destination buffer is incorrect"); + } + { + unsigned char const *const p = + reinterpret_cast(decoded.data.get()); + std::copy(p, p + dstLen, static_cast(dst)); + } +} + +void decompress_huffman_zlib_deprecated(const pb::Header &header, + void const *const src, + const std::size_t srcLen, + void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + + return decompress_huffman_C_deprecated(header, src, srcLen, dst, dstLen); +} + +#ifdef MGARD_ZSTD +void decompress_huffman_zstd_deprecated(const pb::Header &header, + void const *const src, + const std::size_t srcLen, + void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + + return decompress_huffman_C_deprecated(header, src, srcLen, dst, dstLen); +} +#endif + +void decompress_huffman_zlib_rfmh(const pb::Header &header, + void const *const src, + const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + BufferWindow window(src, srcLen); + // Read theSsze in bytes of the serialized Huffman tree. + MemoryBuffer encoded(read_header_size(window)); + decompress_zlib(window.current, window.end - window.current, + encoded.data.get(), encoded.size); + + return decompress_huffman_C_rfmh(header, encoded, dst, dstLen); +} + +#ifdef MGARD_ZSTD +void decompress_huffman_zstd_rfmh(const pb::Header &header, + void const *const src, + const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + assert(header.encoding().serialization() == pb::Encoding::RFMH); + + BufferWindow window(src, srcLen); + // Read the size in bytes of the serialized Huffman tree. + MemoryBuffer encoded(read_header_size(window)); + decompress_zstd(window.current, window.end - window.current, + encoded.data.get(), encoded.size); + + return decompress_huffman_C_rfmh(header, encoded, dst, dstLen); +} +#endif + +void decompress_huffman_zlib(const pb::Header &header, void const *const src, + const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZLIB); + + switch (header.encoding().serialization()) { + case pb::Encoding::DEPRECATED: + return decompress_huffman_zlib_deprecated(header, src, srcLen, dst, dstLen); + case pb::Encoding::RFMH: + return decompress_huffman_zlib_rfmh(header, src, srcLen, dst, dstLen); + default: + throw std::runtime_error("unrecognized Huffman serialization"); + } +} + +#ifdef MGARD_ZSTD +void decompress_huffman_zstd(const pb::Header &header, void const *const src, + const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + assert(header.encoding().compressor() == pb::Encoding::CPU_HUFFMAN_ZSTD); + + switch (header.encoding().serialization()) { + case pb::Encoding::DEPRECATED: + return decompress_huffman_zstd_deprecated(header, src, srcLen, dst, dstLen); + case pb::Encoding::RFMH: + return decompress_huffman_zstd_rfmh(header, src, srcLen, dst, dstLen); + default: + throw std::runtime_error("unrecognized Huffman serialization"); + } +} +#endif + +} // namespace + +void decompress(const pb::Header &header, void const *const src, + const std::size_t srcLen, void *const dst, + const std::size_t dstLen) { + switch (header.encoding().compressor()) { + case pb::Encoding::CPU_ZLIB: + return decompress_zlib(src, srcLen, static_cast(dst), + dstLen); + case pb::Encoding::CPU_ZSTD: +#ifdef MGARD_ZSTD + return decompress_zstd(src, srcLen, reinterpret_cast(dst), + dstLen); +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif + case pb::Encoding::CPU_HUFFMAN_ZLIB: + return decompress_huffman_zlib(header, src, srcLen, dst, dstLen); + case pb::Encoding::CPU_HUFFMAN_ZSTD: +#ifdef MGARD_ZSTD + return decompress_huffman_zstd(header, src, srcLen, dst, dstLen); +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif + default: + throw std::runtime_error("unsupported lossless encoder"); + } +} + +} // namespace mgard diff --git a/src/lossless_zlib.cpp b/src/lossless_zlib.cpp new file mode 100644 index 0000000000..b41ac643ac --- /dev/null +++ b/src/lossless_zlib.cpp @@ -0,0 +1,90 @@ +#include "lossless.hpp" + +#include +#include +#include + +#include + +namespace mgard { + +MemoryBuffer compress_zlib(void const *const src, + const std::size_t srcLen) { + const std::size_t BUFSIZE = 2048 * 1024; + std::vector buffers; + std::vector bufferLengths; + + z_stream strm; + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.next_in = static_cast(const_cast(src)); + strm.avail_in = srcLen; + buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); + bufferLengths.push_back(strm.avail_out = BUFSIZE); + + deflateInit(&strm, Z_BEST_COMPRESSION); + + while (strm.avail_in != 0) { + [[maybe_unused]] const int res = deflate(&strm, Z_NO_FLUSH); + assert(res == Z_OK); + if (strm.avail_out == 0) { + buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); + bufferLengths.push_back(strm.avail_out = BUFSIZE); + } + } + + int res = Z_OK; + while (res == Z_OK) { + if (strm.avail_out == 0) { + buffers.push_back(strm.next_out = new Bytef[BUFSIZE]); + bufferLengths.push_back(strm.avail_out = BUFSIZE); + } + res = deflate(&strm, Z_FINISH); + } + + assert(res == Z_STREAM_END); + bufferLengths.back() -= strm.avail_out; + // Could just do `nbuffers * BUFSIZE - strm.avail_out`. + const std::size_t bufferLen = + std::accumulate(bufferLengths.begin(), bufferLengths.end(), 0); + unsigned char *const buffer = new unsigned char[bufferLen]; + { + const std::size_t nbuffers = buffers.size(); + unsigned char *p = buffer; + for (std::size_t i = 0; i < nbuffers; ++i) { + unsigned char const *const buffer = buffers.at(i); + const std::size_t bufferLength = bufferLengths.at(i); + std::copy(buffer, buffer + bufferLength, p); + p += bufferLength; + delete[] buffer; + } + } + deflateEnd(&strm); + + return MemoryBuffer(buffer, bufferLen); +} + +void decompress_zlib(void const *const src, const std::size_t srcLen, + unsigned char *const dst, const std::size_t dstLen) { + z_stream strm = {}; + strm.total_in = strm.avail_in = srcLen; + strm.total_out = strm.avail_out = dstLen; + strm.next_in = static_cast(const_cast(src)); + strm.next_out = reinterpret_cast(dst); + + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + + [[maybe_unused]] int res; + res = inflateInit2(&strm, (15 + 32)); // 15 window bits, and the +32 tells + // zlib to to detect if using gzip or + // zlib + assert(res == Z_OK); + res = inflate(&strm, Z_FINISH); + assert(res == Z_STREAM_END); + res = inflateEnd(&strm); + assert(res == Z_OK); +} + +} // namespace mgard diff --git a/src/lossless_zstd.cpp b/src/lossless_zstd.cpp new file mode 100644 index 0000000000..749c5794a4 --- /dev/null +++ b/src/lossless_zstd.cpp @@ -0,0 +1,59 @@ +#include "lossless.hpp" + +#include +#include + +#ifndef MGARD_ZSTD +#error "This file requires ZSTD." +#endif + +#include + +namespace mgard { + +/*! CHECK + * Check that the condition holds. If it doesn't print a message and die. + */ +#define CHECK(cond, ...) \ + do { \ + if (!(cond)) { \ + std::fprintf(stderr, "%s:%d CHECK(%s) failed: ", __FILE__, __LINE__, \ + #cond); \ + std::fprintf(stderr, "" __VA_ARGS__); \ + std::fprintf(stderr, "\n"); \ + std::exit(1); \ + } \ + } while (0) + +/*! CHECK_ZSTD + * Check the zstd error code and die if an error occurred after printing a + * message. + */ +#define CHECK_ZSTD(fn, ...) \ + do { \ + std::size_t const err = (fn); \ + CHECK(!ZSTD_isError(err), "%s", ZSTD_getErrorName(err)); \ + } while (0) + +MemoryBuffer compress_zstd(void const *const src, + const std::size_t srcLen) { + const std::size_t cBuffSize = ZSTD_compressBound(srcLen); + unsigned char *const buffer = new unsigned char[cBuffSize]; + const std::size_t cSize = ZSTD_compress(buffer, cBuffSize, src, srcLen, 1); + CHECK_ZSTD(cSize); + return MemoryBuffer(buffer, cSize); +} + +void decompress_zstd(void const *const src, const std::size_t srcLen, + unsigned char *const dst, const std::size_t dstLen) { + std::size_t const dSize = ZSTD_decompress(dst, dstLen, src, srcLen); + CHECK_ZSTD(dSize); + + /* When zstd knows the content size, it will error if it doesn't match. */ + CHECK(dstLen == dSize, "Impossible because zstd will check this condition!"); +} + +#undef CHECK_ZSTD +#undef CHECK + +} // namespace mgard diff --git a/src/mgard.proto b/src/mgard.proto index f5c6fd6fa9..a8353f1dec 100644 --- a/src/mgard.proto +++ b/src/mgard.proto @@ -14,10 +14,14 @@ message CartesianGridTopology { repeated uint64 shape = 2; } -message ExplicitCubeGeometry { repeated double coordinates = 2; } +message ExplicitCubeGeometry { + repeated double coordinates = 2; +} message Domain { - enum Topology { CARTESIAN_GRID = 0; } + enum Topology { + CARTESIAN_GRID = 0; + } enum Geometry { UNIT_CUBE = 0; EXPLICIT_CUBE = 1; @@ -78,7 +82,9 @@ message DomainDecomposition { } message FunctionDecomposition { - enum Transform { MULTILEVEL_COEFFICIENTS = 0; } + enum Transform { + MULTILEVEL_COEFFICIENTS = 0; + } enum Hierarchy { POWER_OF_TWO_PLUS_ONE = 0; MULTIDIMENSION_WITH_GHOST_NODES = 1; @@ -92,7 +98,9 @@ message FunctionDecomposition { } message Quantization { - enum Method { COEFFICIENTWISE_LINEAR = 0; } + enum Method { + COEFFICIENTWISE_LINEAR = 0; + } enum BinWidths { PER_COEFFICIENT = 0; PER_LEVEL = 1; @@ -116,20 +124,89 @@ message Encoding { SHUFFLE = 1; } enum Compressor { + // Not yet implemented. We'll want to add a message for quantized coefficients stored 'verbatim' + // (probably still somewhat compressed because of the varint encoding used for `int64`s). NOOP_COMPRESSOR = 0; - CPU_HUFFMAN_ZLIB = 1; + // Explanation for the wonky numbering: this first case was originally called `CPU_HUFFMAN_ZLIB`, + // but the relevant code didn't actually call the Huffman encoder. + CPU_ZLIB = 1; + CPU_ZSTD = 7; + CPU_HUFFMAN_ZLIB = 6; CPU_HUFFMAN_ZSTD = 2; X_HUFFMAN = 3; X_HUFFMAN_LZ4 = 4; X_HUFFMAN_ZSTD = 5; } + enum HuffmanSerialization { + // Original method, with 'raw' buffer serialization. + DEPRECATED = 0; + // Symbol range, frequency table, missed table, and hit buffer. + RFMH = 1; + } Preprocessor preprocessor = 1; Compressor compressor = 2; - // Only relevant when `compressor == X_HUFFMAN` or `lossless_compressor == - // X_HUFFMAN_LZ4` or `compressor == X_HUFFMAN_ZSTD` + // Only relevant when `compressor == X_HUFFMAN` or `compressor == + // X_HUFFMAN_LZ4` or `compressor == X_HUFFMAN_ZSTD`. uint64 huffman_dictionary_size = 3; uint64 huffman_block_size = 4; + + // Only relevant when `compressor == CPU_HUFFMAN_ZLIB` or + // `compressor == CPU_HUFFMAN_ZSTD`. + HuffmanSerialization serialization = 5; + +} + +message HuffmanHeader { + enum IndexMapping { + // Codewords are (potentially) assigned to the symbols `{min, …, max}`. + // Index `0` is reserved for missed symbols. Then `min` is assigned + // index `1`, `min + 1` is assigned index `2`, and so on. + INCLUSIVE_RANGE = 0; + } + enum CodewordMapping { + // A frequency table is stored as a sequence of index–frequency pairs. + // This table is used to construct a Huffman code creation tree. + INDEX_FREQUENCY_PAIRS = 0; + } + enum MissedEncoding { + // The missed symbols (rather than their indices, for example) are encoded. + LITERAL = 0; + } + enum HitEncoding { + // The codeword bits are run together into a single byte array. + RUN_TOGETHER = 0; + } + + // How each (eligible) symbol is assigned an index. + IndexMapping index_mapping = 1; + // How each (encountered) index is assigned a codeword. + CodewordMapping codeword_mapping = 2; + // How the missed buffer is encoded. + MissedEncoding missed_encoding = 3; + // How the hit (codeword) buffer is encoded. + HitEncoding hit_encoding = 4; + + // Minimum and maximum symbols eligible for codewords. + repeated sint64 endpoints = 5; + // Sizes in bytes of serialized `FrequencySubtable`s to followw. + repeated uint64 nbytes_frequency_subtables = 6; + // Sizes in bytes of serialized `MissedSubtable`s to follow. + repeated uint64 nbytes_missed_subtables = 7; + // Size in bits of the hit buffer to follow. + uint64 nbits = 8; +} + +// One or more of these will follow a `HuffmanHeader`. +message FrequencySubtable { + // Index–frequency pairs for frequency table. + map frequencies = 6; +} + +// One or more of these will follow the `FrequencySubtable`s after a `HuffmanHeader`. +message MissedSubtable { + // Encountered symbols that were not assigned codewords. + repeated sint64 missed = 7; } message Device { diff --git a/src/utilities.cpp b/src/utilities.cpp new file mode 100644 index 0000000000..1c5afeb3d0 --- /dev/null +++ b/src/utilities.cpp @@ -0,0 +1,64 @@ +#include "utilities.hpp" + +#include + +#include + +namespace mgard { + +Bits::Bits(unsigned char const *const begin, unsigned char const *const end, + const unsigned char offset_end) + : begin_(begin), end_(end), offset_end(offset_end) { + if (offset_end >= CHAR_BIT) { + throw std::invalid_argument( + "offset must be smaller than number of bits in byte"); + } +} + +Bits::Bits(unsigned char const *const begin, unsigned char const *const end) + : Bits(begin, end, 0) {} + +bool Bits::operator==(const Bits &other) const { + return begin_ == other.begin_ and end_ == other.end_ and + offset_end == other.offset_end; +} + +bool Bits::operator!=(const Bits &other) const { return !operator==(other); } + +Bits::iterator Bits::begin() const { return {*this, begin_, 0}; } + +Bits::iterator Bits::end() const { return {*this, end_, offset_end}; } + +Bits::iterator::iterator(const Bits &iterable, unsigned char const *const p, + const unsigned char offset) + : iterable(iterable), p(p), offset(offset) {} + +bool Bits::iterator::operator==(const Bits::iterator &other) const { + return offset == other.offset and p == other.p and iterable == other.iterable; +} + +bool Bits::iterator::operator!=(const Bits::iterator &other) const { + return !operator==(other); +} + +Bits::iterator &Bits::iterator::operator++() { + ++offset; + if (offset == CHAR_BIT) { + ++p; + offset = 0; + } + return *this; +} + +Bits::iterator Bits::iterator::operator++(int) { + const iterator tmp = *this; + operator++(); + return tmp; +} + +Bits::iterator::reference Bits::iterator::operator*() const { + // Operator precedence: dereference, then left shift, then bitwise AND. + return *p << offset & 0x80; +} + +} // namespace mgard diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1e67174fac..80747cefca 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -18,8 +18,11 @@ set( "src/test_decompose.cpp" "src/test_format.cpp" "src/test_quantize.cpp" - "src/test_compressors.cpp" + "src/lossless_regression.cpp" + "src/test_lossless.cpp" "src/test_CompressedDataset.cpp" + "src/huffman_regression.cpp" + "src/test_huffman.cpp" ) if(MGARD_ENABLE_UNSTRUCTURED AND MOAB_FOUND) diff --git a/tests/include/huffman_regression.hpp b/tests/include/huffman_regression.hpp new file mode 100644 index 0000000000..e6c00b092f --- /dev/null +++ b/tests/include/huffman_regression.hpp @@ -0,0 +1,34 @@ +#ifndef TESTING_HUFFMAN_REGRESSION_HPP +#define TESTING_HUFFMAN_REGRESSION_HPP +//!\file +//!\brief Huffman encoding and decoding functions for regression tests. + +#include + +#include "huffman.hpp" + +namespace mgard { + +namespace regression { + +//! Encode quantized coefficients using a Huffman code. +//! +//! The algorithm modifies the quantized data, so the input buffer is copied. +//! +//!\param[in, out] quantized_data Input buffer (quantized coefficients). This +//! buffer will be changed by the encoding process. +//!\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. +HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, + const std::size_t n); + +//! Decode a stream encoded using a Huffman code. +//! +//!\param[in] encoded Input buffer (Huffman-encoded stream). +MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded); + +} // namespace regression + +} // namespace mgard + +#endif diff --git a/tests/include/lossless_regression.hpp b/tests/include/lossless_regression.hpp new file mode 100644 index 0000000000..a1adfe0ee7 --- /dev/null +++ b/tests/include/lossless_regression.hpp @@ -0,0 +1,40 @@ +#ifndef TESTING_COMPRESSORS_REGRESSION_HPP +#define TESTING_COMPRESSORS_REGRESSION_HPP +//!\file +//!\brief Huffman compression and decompression functions for regression tests. + +#include + +#include "format.hpp" +#include "utilities.hpp" + +namespace mgard { + +namespace regression { + +//! Compress an array using a Huffman tree. +//! +//!\param[in] header Header for the self-describing buffer. +//!\param[in] src Array to be compressed. +//!\param[in] srcLen Size of array (number of elements) to be compressed. +MemoryBuffer compress_memory_huffman(const pb::Header &header, + long int const *const src, + const std::size_t srcLen); + +//! Decompress an array compressed with `compress_memory_huffman`. +//! +//!\param[in] header Header parsed from the original self-describing buffer. +//!\param[in] src Compressed array. +//!\param[in] srcLen Size in bytes of the compressed array. +//!\param[out] dst Decompressed array. +//!\param[in] dstLen Size in bytes of the decompressed array. +void decompress_memory_huffman(const pb::Header &header, + unsigned char const *const src, + const std::size_t srcLen, long int *const dst, + const std::size_t dstLen); + +} // namespace regression + +} // namespace mgard + +#endif diff --git a/tests/include/testing_utilities.hpp b/tests/include/testing_utilities.hpp index 318d521d6d..4b8343d783 100644 --- a/tests/include/testing_utilities.hpp +++ b/tests/include/testing_utilities.hpp @@ -61,5 +61,26 @@ mgard::TensorMeshHierarchy make_flat_hierarchy(const mgard::TensorMeshHierarchy &hierarchy, const std::array shape); +//! Function object to generate periodic data. +struct PeriodicGenerator { + //! Constructor. + //! + //!\param value Starting value. + //!\param period Generator period. + PeriodicGenerator(const std::size_t period, const long int value); + + //! Generator period. + std::size_t period; + + //! Starting value. + long int value; + + //! Number of times `operator()` has been called. + std::size_t ncalls; + + //! Generate next value in periodic sequence. + long int operator()(); +}; + #include "testing_utilities.tpp" #endif diff --git a/tests/src/huffman_regression.cpp b/tests/src/huffman_regression.cpp new file mode 100644 index 0000000000..6d8739dbde --- /dev/null +++ b/tests/src/huffman_regression.cpp @@ -0,0 +1,379 @@ +#include "huffman_regression.hpp" + +#include +#include + +#include +#include + +namespace mgard { + +namespace regression { + +//! Node in the Huffman code creation tree. +struct htree_node { + //! Constructor. + //! + //!\param q (Transformed) symbol. + //!\param cnt Number of occurences of the (transformed) symbol in the source. + htree_node(const int q, const std::size_t cnt) + : q(q), cnt(cnt), code(0), len(0), left(nullptr), right(nullptr) {} + + //! (Transformed) symbol. + int q; + + //! Number of occurences of the (transformed) symbol in the source. + std::size_t cnt; + + //! Codeword associated to the (transformed) symbol. + unsigned int code; + + //! Length in bits of the codeword. + std::size_t len; + + //! Left child in the code creation tree. + htree_node *left; + + //! Right child in the code creation tree. + htree_node *right; +}; + +//! Input symbol–Huffman code pair. +struct huffman_codec { + //! (Transformed) symbol. + int q; + + //! Codeword associated to the (transformed) symbol. + unsigned int code; + + //! Length in bits of the codeword. + std::size_t len; +}; + +//! Frequency table and symbol–code mappings for encoding source. +template struct HuffmanCodec { + // The arrays are value-initialized, which leads to each of their elements + // being value-initialized (ultimately zero-initialized). + + //! Input symbol–Huffman code pairs. + std::array codec{}; + + //! Frequency table for encoding source. + std::array frequency_table{}; +}; + +//! Function object for comparing Huffman code creation nodes. +struct LessThanByCnt { + //! Return whether the first node has a larger count than the second. + //! + //!\param lhs First node. + //!\param rhs Second node. + bool operator()(htree_node const *const lhs, + htree_node const *const rhs) const { + return lhs->cnt > rhs->cnt; + } +}; + +template +using my_priority_queue = + std::priority_queue, LessThanByCnt>; + +void initialize_codec(HuffmanCodec &codec, htree_node *const root, + const unsigned int code, const std::size_t len) { + std::array &codewords = codec.codec; + + root->code = code; + root->len = len; + + if (!root->left && !root->right) { + const std::size_t index = root->q; + codewords.at(index) = {root->q, code, len}; + } + + if (root->left) { + initialize_codec(codec, root->left, code << 1, len + 1); + } + + if (root->right) { + initialize_codec(codec, root->right, code << 1 | 0x1, len + 1); + } +} + +my_priority_queue *build_tree(std::size_t const *const cnt) { + my_priority_queue *const phtree = + new my_priority_queue; + for (std::size_t i = 0; i < nql; i++) { + if (cnt[i] != 0) { + htree_node *const new_node = new htree_node(i, cnt[i]); + phtree->push(new_node); + } + } + + while (phtree->size() > 1) { + htree_node *const top_node1 = phtree->top(); + phtree->pop(); + htree_node *const top_node2 = phtree->top(); + phtree->pop(); + + htree_node *const new_node = + new htree_node(-1, top_node1->cnt + top_node2->cnt); + new_node->left = top_node1; + new_node->right = top_node2; + phtree->push(new_node); + } + return phtree; +} + +void free_htree_node(htree_node *const node) { + if (node->left) { + free_htree_node(node->left); + node->left = nullptr; + } + + if (node->right) { + free_htree_node(node->right); + node->right = nullptr; + } + + delete node; +} + +void free_tree(my_priority_queue *const phtree) { + if (phtree) { + free_htree_node(phtree->top()); + + phtree->pop(); + + delete phtree; + } +} + +//! Populate the frequency table of a `HuffmanCodec`. +//! +//!\note This function will change the quantized data. +//! +//!\param[in, out] quantized_data Input buffer (quantized coefficients). This +//! buffer will be changed by the codec-building process. +//\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. +void initialize_frequency_table(HuffmanCodec &codec, + long int *const quantized_data, + const std::size_t n) { + assert(*std::max_element(codec.frequency_table.begin(), + codec.frequency_table.end()) == 0); + + for (std::size_t i = 0; i < n; i++) { + // Convert quantization level to positive so that counting freq can be + // easily done. Level 0 is reserved a out-of-range flag. + quantized_data[i] = quantized_data[i] + nql / 2; + ++codec.frequency_table[quantized_data[i] > 0 && + quantized_data[i] < + static_cast(nql) + ? quantized_data[i] + : 0]; + } +} + +//! Build a Huffman codec for an input buffer. +//! +//!\param[in, out] quantized_data Input buffer (quantized coefficients). This +//! buffer will be changed by the codec-building process. +//\param[in] n Number of symbols (`long int` quantized coefficients) in the +//! input buffer. +template +HuffmanCodec build_huffman_codec(long int *const quantized_data, + const std::size_t n) { + HuffmanCodec codec; + initialize_frequency_table(codec, quantized_data, n); + + my_priority_queue *const phtree = + build_tree(codec.frequency_table.data()); + + initialize_codec(codec, phtree->top(), 0, 0); + + free_tree(phtree); + + return codec; +} + +HuffmanEncodedStream huffman_encoding(long int const *const quantized_data, + const std::size_t n) { + long int *const quantized_data_ = new long int[n]; + std::copy(quantized_data, quantized_data + n, quantized_data_); + + const HuffmanCodec codec = build_huffman_codec(quantized_data_, n); + const std::size_t num_miss = codec.frequency_table[0]; + + assert(n >= num_miss); + + std::size_t nnz = 0; + std::size_t nbits = 0; + for (std::size_t i = 0; i < nql; ++i) { + const huffman_codec &codec_ = codec.codec.at(i); + const std::size_t frequency = codec.frequency_table.at(i); + nbits += frequency * codec_.len; + nnz += frequency ? 1 : 0; + } + + HuffmanEncodedStream out(nbits, num_miss * sizeof(int), + 2 * nnz * sizeof(std::size_t)); + + unsigned int *const hit = + reinterpret_cast(out.hit.data.get()); + std::fill(hit, hit + out.hit.size / sizeof(unsigned int), 0u); + + int *missed = reinterpret_cast(out.missed.data.get()); + + // write frequency table to buffer + std::size_t *const cft = + reinterpret_cast(out.frequencies.data.get()); + std::size_t off = 0; + for (std::size_t i = 0; i < nql; ++i) { + if (codec.frequency_table[i] > 0) { + cft[2 * off] = i; + cft[2 * off + 1] = codec.frequency_table[i]; + off++; + } + } + + std::size_t start_bit = 0; + for (std::size_t i = 0; i < n; i++) { + const int q = quantized_data_[i]; + unsigned int code; + std::size_t len; + + if (q > 0 && q < static_cast(nql)) { + // for those that are within the range + code = codec.codec[q].code; + len = codec.codec[q].len; + } else { + // for those that are out of the range, q is set to 0 + code = codec.codec[0].code; + len = codec.codec[0].len; + + *missed++ = q; + } + + // Note that if len == 0, then that means that either the data is all the + // same number or (more likely) all data are outside the quantization + // range. Either way, the code contains no information and is therefore 0 + // bits. + + if (32 - start_bit % 32 < len) { + // current unsigned int cannot hold the code + // copy 32 - start_bit % 32 bits to the current int + // and copy the rest len - (32 - start_bit % 32) to the next int + const std::size_t rshift = len - (32 - start_bit % 32); + const std::size_t lshift = 32 - rshift; + *(hit + start_bit / 32) = (*(hit + start_bit / 32)) | (code >> rshift); + *(hit + start_bit / 32 + 1) = + (*(hit + start_bit / 32 + 1)) | (code << lshift); + } else if (len) { + code = code << (32 - start_bit % 32 - len); + *(hit + start_bit / 32) = (*(hit + start_bit / 32)) | code; + } + // No effect if `len == 0`. + start_bit += len; + } + + delete[] quantized_data_; + + return out; +} + +MemoryBuffer huffman_decoding(const HuffmanEncodedStream &encoded) { + const std::size_t out_data_miss_size = encoded.missed.size; + const std::size_t out_tree_size = encoded.frequencies.size; + unsigned char const *const out_data_hit = encoded.hit.data.get(); + unsigned char const *const out_data_miss = encoded.missed.data.get(); + unsigned char const *const out_tree = encoded.frequencies.data.get(); + + std::size_t const *const cft = (std::size_t const *)out_tree; + const std::size_t nnz = out_tree_size / (2 * sizeof(std::size_t)); + // The elements of the array are value-initialized (here, zero-initialized). + std::size_t *const ft = new std::size_t[nql](); + + std::size_t nquantized = 0; + for (std::size_t j = 0; j < nnz; ++j) { + const std::size_t frequency = cft[2 * j + 1]; + nquantized += frequency; + ft[cft[2 * j]] = frequency; + } + + MemoryBuffer out(nquantized); + long int *const quantized_data = out.data.get(); + + my_priority_queue *const phtree = build_tree(ft); + delete[] ft; + + unsigned int const *const buf = (unsigned int const *)out_data_hit; + + // The out_data_miss may not be aligned. Therefore, the code + // here makes a new buffer. + assert(not(out_data_miss_size % sizeof(int))); + int *const miss_buf = new int[out_data_miss_size / sizeof(int)]; + if (out_data_miss_size) { + std::memcpy(miss_buf, out_data_miss, out_data_miss_size); + } + + int const *miss_bufp = miss_buf; + + std::size_t start_bit = 0; + unsigned int mask = 0x80000000; + + long int *q = quantized_data; + std::size_t i = 0; + std::size_t num_missed = 0; + while (q < quantized_data + nquantized) { + htree_node const *root = phtree->top(); + assert(root); + + std::size_t len = 0; + int offset = 0; + while (root->left) { + int flag = *(buf + start_bit / 32 + offset) & mask; + if (!flag) { + root = root->left; + } else { + root = root->right; + } + + len++; + + mask >>= 1; + if (!mask) { + mask = 0x80000000; + offset = 1; + } else { + // offset = 0; + } + } + + if (root->q != 0) { + *q = root->q - nql / 2; + + } else { + *q = *miss_bufp - nql / 2; + + miss_bufp++; + num_missed++; + } + + q++; + i++; + + start_bit += len; + } + + assert(start_bit == encoded.nbits); + assert(sizeof(int) * num_missed == out_data_miss_size); + + delete[] miss_buf; + free_tree(phtree); + + return out; +} + +} // namespace regression + +} // namespace mgard diff --git a/tests/src/lossless_regression.cpp b/tests/src/lossless_regression.cpp new file mode 100644 index 0000000000..11c04a4a01 --- /dev/null +++ b/tests/src/lossless_regression.cpp @@ -0,0 +1,183 @@ +#include "lossless_regression.hpp" + +#include +#include + +#include "huffman.hpp" +#include "huffman_regression.hpp" +#include "lossless.hpp" + +namespace mgard { + +namespace regression { + +static_assert(CHAR_BIT == 8, "code written assuming `CHAR_BIT == 8`"); + +static_assert(sizeof(unsigned int) == 4, + "code written assuming `sizeof(unsigned int) == 4`"); + +static_assert(sizeof(std::size_t) == 8, + "code written assuming `sizeof(std::size_t) == 8`"); + +namespace { + +std::size_t hit_buffer_size(const std::size_t nbits) { + return nbits / CHAR_BIT + sizeof(unsigned int); +} + +MemoryBuffer compress_serialized(const pb::Header &header, + unsigned char const *const p, + const std::size_t n) { + assert(header.encoding().serialization() == pb::Encoding::DEPRECATED); + + switch (header.encoding().compressor()) { + case pb::Encoding::CPU_HUFFMAN_ZLIB: + return compress_zlib(p, n); + case pb::Encoding::CPU_HUFFMAN_ZSTD: +#ifdef MGARD_ZSTD + return compress_zstd(p, n); +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif + default: + throw std::runtime_error("unrecognized lossless compressor"); + } +} + +} // namespace + +// This code also makes endianness assumptions. + +MemoryBuffer compress_memory_huffman(const pb::Header &header, + long int const *const src, + const std::size_t srcLen) { + HuffmanEncodedStream encoded = + mgard::regression::huffman_encoding(src, srcLen); + + assert(not(encoded.hit.size % sizeof(unsigned int))); + + const std::size_t offset = encoded.nbits % (CHAR_BIT * sizeof(unsigned int)); + // Number of hit buffer padding bytes. + const std::size_t nhbpb = offset ? offset / CHAR_BIT : sizeof(unsigned int); + + assert(encoded.hit.size + nhbpb == hit_buffer_size(encoded.nbits)); + + const std::size_t npayload = + encoded.hit.size + nhbpb + encoded.missed.size + encoded.frequencies.size; + unsigned char *const payload = new unsigned char[npayload]; + unsigned char *bufp = payload; + + std::memcpy(bufp, encoded.frequencies.data.get(), encoded.frequencies.size); + bufp += encoded.frequencies.size; + + std::memcpy(bufp, encoded.hit.data.get(), encoded.hit.size); + bufp += encoded.hit.size; + + { + const unsigned char zero{0}; + for (std::size_t i = 0; i < nhbpb; ++i) { + std::memcpy(bufp, &zero, 1); + bufp += 1; + } + } + + std::memcpy(bufp, encoded.missed.data.get(), encoded.missed.size); + bufp += encoded.missed.size; + + const MemoryBuffer out_data = + compress_serialized(header, payload, npayload); + + delete[] payload; + bufp = nullptr; + + const std::size_t bufferLen = 3 * sizeof(std::size_t) + out_data.size; + unsigned char *const buffer = new unsigned char[bufferLen]; + + bufp = buffer; + *(std::size_t *)bufp = encoded.frequencies.size; + bufp += sizeof(std::size_t); + + *(std::size_t *)bufp = encoded.nbits; + bufp += sizeof(std::size_t); + + *(std::size_t *)bufp = encoded.missed.size; + bufp += sizeof(std::size_t); + + { + unsigned char const *const p = out_data.data.get(); + std::copy(p, p + out_data.size, bufp); + } + return MemoryBuffer(buffer, bufferLen); +} + +void decompress_memory_huffman(const pb::Header &header, + unsigned char const *const src, + const std::size_t srcLen, long int *const dst, + const std::size_t dstLen) { + assert(header.encoding().serialization() == pb::Encoding::DEPRECATED); + + std::size_t const *const sizes = reinterpret_cast(src); + const std::size_t nfrequencies = sizes[0]; + const std::size_t nbits = sizes[1]; + const std::size_t nmissed = sizes[2]; + const std::size_t nhit = hit_buffer_size(nbits); + + MemoryBuffer buffer(nfrequencies + nhit + nmissed); + { + const std::size_t offset = 3 * sizeof(std::size_t); + unsigned char const *const src_ = src + offset; + const std::size_t srcLen_ = srcLen - offset; + unsigned char *const dst_ = buffer.data.get(); + const std::size_t dstLen_ = buffer.size; + + switch (header.encoding().compressor()) { + case pb::Encoding::CPU_HUFFMAN_ZLIB: + decompress_zlib(src_, srcLen_, dst_, dstLen_); + break; + case pb::Encoding::CPU_HUFFMAN_ZSTD: +#ifdef MGARD_ZSTD + decompress_zstd(src_, srcLen_, dst_, dstLen_); + break; +#else + throw std::runtime_error("MGARD compiled without ZSTD support"); +#endif + default: + throw std::runtime_error("unrecognized lossless compressor"); + } + } + + HuffmanEncodedStream encoded(nbits, nmissed, nfrequencies); + { + unsigned char const *begin; + unsigned char const *end; + + begin = buffer.data.get(); + end = begin + nfrequencies; + std::copy(begin, end, encoded.frequencies.data.get()); + + begin = end; + assert(encoded.hit.size <= nhit); + end = begin + encoded.hit.size; + std::copy(begin, end, encoded.hit.data.get()); + + // Skip any bytes between `begin + encoded.hit.size` and `begin + nhit`. + begin = end + nhit - encoded.hit.size; + end = begin + nmissed; + std::copy(begin, end, encoded.missed.data.get()); + } + + const MemoryBuffer decoded = + mgard::regression::huffman_decoding(encoded); + { + long int const *const p = decoded.data.get(); + if (decoded.size * sizeof(*p) != dstLen) { + throw std::runtime_error( + "mismatch between expected and obtained decompressed buffer sizes"); + } + std::copy(p, p + decoded.size, dst); + } +} + +} // namespace regression + +} // namespace mgard diff --git a/tests/src/test_compress.cpp b/tests/src/test_compress.cpp index ebb41eecfd..b59c05eff0 100644 --- a/tests/src/test_compress.cpp +++ b/tests/src/test_compress.cpp @@ -398,7 +398,7 @@ void test_self_describing_decompression( TEMPLATE_TEST_CASE("decompressing self-describing buffer", "[compress]", float, double) { - std::default_random_engine gen(32094); + std::default_random_engine gen(361656); const std::vector smoothness_parameters = { -1.5, -0.5, 0.0, 0.5, 1.5, std::numeric_limits::infinity()}; const std::vector tolerances = {1, 0.1, 0.01, 0.001}; diff --git a/tests/src/test_compressors.cpp b/tests/src/test_compressors.cpp deleted file mode 100644 index 8ab071fb6f..0000000000 --- a/tests/src/test_compressors.cpp +++ /dev/null @@ -1,247 +0,0 @@ -#include "catch2/catch_test_macros.hpp" - -#include - -#include -#include -#include - -#include "compressors.hpp" -#include "format.hpp" - -namespace { - -template -void test_huffman_identity(std::default_random_engine &gen, - const std::size_t n) { - std::uniform_int_distribution dis(std::numeric_limits::min()); - const auto f = [&]() -> T { return dis(gen); }; - std::vector src(n); - std::generate(src.begin(), src.end(), f); - std::vector src_(src); - mgard::MemoryBuffer compressed = - mgard::compress_memory_huffman(src_.data(), n); - long int *const decompressed = new long int[n]; - mgard::decompress_memory_huffman(compressed.data.get(), compressed.size, - decompressed, n * sizeof(long int)); - REQUIRE(std::equal(src.begin(), src.end(), decompressed)); - delete[] decompressed; -} - -} // namespace - -TEST_CASE("Huffman compression", "[compressors] [!mayfail]") { - std::default_random_engine gen(257100); - const std::size_t n = 5000; - SECTION("signed characters") { test_huffman_identity(gen, n); } - SECTION("short integers") { test_huffman_identity(gen, n); } - SECTION("integers") { test_huffman_identity(gen, n); } - SECTION("long integers") { test_huffman_identity(gen, n); } -} - -namespace { - -void test_zstd_identity(std::uniform_int_distribution &dis, - std::default_random_engine &gen, const std::size_t n) { - const auto f = [&]() -> unsigned char { return dis(gen); }; - unsigned char *const src = new unsigned char[n]; - std::generate(src, src + n, f); - unsigned char *const src_ = new unsigned char[n]; - std::copy(src, src + n, src_); - mgard::MemoryBuffer dst = mgard::compress_memory_zstd(src_, n); - delete[] src_; - unsigned char *const decompressed = new unsigned char[n]; - mgard::decompress_memory_zstd(dst.data.get(), dst.size, decompressed, n); - REQUIRE(std::equal(src, src + n, decompressed)); - delete[] decompressed; - delete[] src; -} - -} // namespace - -#ifdef MGARD_ZSTD -TEST_CASE("zstd compression", "[compressors]") { - std::uniform_int_distribution dis; - std::default_random_engine gen(158648); - const std::vector ns{10, 10, 1000, 10000}; - for (const std::size_t n : ns) { - test_zstd_identity(dis, gen, n); - } -} -#endif - -namespace { - -void test_zlib_identity(std::uniform_int_distribution &dis, - std::default_random_engine &gen, const std::size_t n) { - const auto f = [&]() -> unsigned char { return dis(gen); }; - unsigned char *const src = new unsigned char[n]; - std::generate(src, src + n, f); - unsigned char *const src_ = new unsigned char[n]; - std::copy(src, src + n, src_); - mgard::MemoryBuffer dst = mgard::compress_memory_z(src_, n); - delete[] src_; - unsigned char *const decompressed = new unsigned char[n]; - mgard::decompress_memory_z(dst.data.get(), dst.size, decompressed, n); - REQUIRE(std::equal(src, src + n, decompressed)); - delete[] decompressed; - delete[] src; -} - -} // namespace - -TEST_CASE("zlib compression", "[compressors]") { - std::uniform_int_distribution dis; - std::default_random_engine gen(252315); - const std::vector ns{10, 10, 1000, 10000}; - for (const std::size_t n : ns) { - test_zlib_identity(dis, gen, n); - } -} - -TEST_CASE("compression with header configuration", "[compressors]") { - mgard::pb::Header header; - // TODO: Once Huffman trees can be built for types other than `long int`, use - // something other than `std::int64_t` here. - mgard::populate_defaults(header); - - const std::size_t ndof = 10000; - std::int64_t *const quantized = new std::int64_t[ndof]; - std::uniform_int_distribution dis(-250, 250); - std::default_random_engine gen(419643); - const auto f = [&]() -> std::int64_t { return dis(gen); }; - std::generate(quantized, quantized + ndof, f); - const std::size_t quantizedLen = ndof * sizeof(*quantized); - // `dst` must have the correct alignment for the quantization type. - std::int64_t *const dst = new std::int64_t[ndof]; - - std::int64_t *const quantized_ = new std::int64_t[ndof]; - std::copy(quantized, quantized + ndof, quantized_); - const mgard::MemoryBuffer compressed = - mgard::compress(header, quantized_, quantizedLen); - delete[] quantized_; - - const mgard::pb::Encoding &e = header.encoding(); - REQUIRE(e.preprocessor() == mgard::pb::Encoding::SHUFFLE); -#ifdef MGARD_ZSTD - REQUIRE(e.compressor() == mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); - mgard::decompress_memory_huffman(compressed.data.get(), compressed.size, dst, - quantizedLen); -#else - REQUIRE(e.compressor() == mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); - mgard::decompress_memory_z(compressed.data.get(), compressed.size, dst, - quantizedLen); -#endif - REQUIRE(std::equal(quantized, quantized + ndof, dst)); - delete[] dst; - delete[] quantized; -} - -TEST_CASE("decompression with header configuration", "[compressors]") { - mgard::pb::Header header; - // TODO: Once Huffman trees can be built for types other than `long int`, use - // something other than `std::int64_t` here. - mgard::populate_defaults(header); - - const std::size_t ndof = 5000; - std::int64_t *const quantized = new std::int64_t[ndof]; - std::uniform_int_distribution dis(-500, 500); - std::default_random_engine gen(489063); - const auto f = [&]() -> std::int64_t { return dis(gen); }; - std::generate(quantized, quantized + ndof, f); - const std::size_t quantizedLen = ndof * sizeof(*quantized); - // `dst` must have the correct alignment for the quantization type. - std::int64_t *const dst = new std::int64_t[ndof]; - - mgard::pb::Encoding &e = *header.mutable_encoding(); - SECTION("noop") { - e.set_compressor(mgard::pb::Encoding::NOOP_COMPRESSOR); - - const std::size_t srcLen = quantizedLen; - unsigned char *const src = new unsigned char[srcLen]; - { - unsigned char const *const p = - reinterpret_cast(quantized); - std::copy(p, p + quantizedLen, src); - } - - mgard::decompress(header, src, srcLen, - reinterpret_cast(dst), quantizedLen); - delete[] src; - REQUIRE(std::equal(quantized, quantized + ndof, dst)); - } - - SECTION("zlib") { - e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); - - const mgard::MemoryBuffer out = - mgard::compress_memory_z(quantized, quantizedLen); - - const std::size_t srcLen = out.size * sizeof(*out.data.get()); - unsigned char *const src = new unsigned char[srcLen]; - { - unsigned char const *const p = out.data.get(); - std::copy(p, p + srcLen, src); - } - mgard::decompress(header, src, srcLen, - reinterpret_cast(dst), quantizedLen); - delete[] src; - REQUIRE(std::equal(quantized, quantized + ndof, dst)); - } - -#ifdef MGARD_ZSTD - SECTION("zstd") { - e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); - - std::int64_t *const quantized_ = new std::int64_t[ndof]; - std::copy(quantized, quantized + ndof, quantized_); - const mgard::MemoryBuffer out = - mgard::compress_memory_huffman(quantized_, ndof); - delete[] quantized_; - - const std::size_t srcLen = out.size; - unsigned char *const src = new unsigned char[srcLen]; - { - unsigned char const *const p = out.data.get(); - std::copy(p, p + srcLen, src); - } - mgard::decompress(header, src, srcLen, - reinterpret_cast(dst), quantizedLen); - delete[] src; - REQUIRE(std::equal(quantized, quantized + ndof, dst)); - } -#endif - - delete[] dst; - delete[] quantized; -} - -TEST_CASE("compression and decompression with header", "[compressors]") { - mgard::pb::Header header; - // TODO: Once Huffman trees can be built for types other than `long int`, use - // something other than `std::int64_t` here. - mgard::populate_defaults(header); - - const std::size_t ndof = 2500; - std::int64_t *const quantized = new std::int64_t[ndof]; - std::uniform_int_distribution dis(-1000, 1000); - std::default_random_engine gen(995719); - const auto f = [&]() -> std::int64_t { return dis(gen); }; - std::generate(quantized, quantized + ndof, f); - const std::size_t quantizedLen = ndof * sizeof(*quantized); - // `dst` must have the correct alignment for the quantization type. - std::int64_t *const dst = new std::int64_t[ndof]; - - std::int64_t *const quantized_ = new std::int64_t[ndof]; - std::copy(quantized, quantized + ndof, quantized_); - const mgard::MemoryBuffer compressed = - mgard::compress(header, quantized_, quantizedLen); - delete[] quantized_; - - mgard::decompress(header, compressed.data.get(), compressed.size, dst, - quantizedLen); - - REQUIRE(std::equal(quantized, quantized + ndof, dst)); - delete[] dst; - delete[] quantized; -} diff --git a/tests/src/test_format.cpp b/tests/src/test_format.cpp index 64943b2421..48c0751c87 100644 --- a/tests/src/test_format.cpp +++ b/tests/src/test_format.cpp @@ -180,41 +180,40 @@ TEST_CASE("dataset types", "[format]") { REQUIRE(mgard::type_to_dataset_type() == mgard::pb::Dataset::DOUBLE); } -TEST_CASE("quantization type sizes", "[format]") { - mgard::pb::Header header; - mgard::pb::Quantization &quantization = *header.mutable_quantization(); - const std::size_t ndof = 1; +TEST_CASE("quantization types", "[format]") { + REQUIRE(mgard::type_to_quantization_type() == + mgard::pb::Quantization::INT8_T); + REQUIRE(mgard::type_to_quantization_type() == + mgard::pb::Quantization::INT16_T); + REQUIRE(mgard::type_to_quantization_type() == + mgard::pb::Quantization::INT32_T); + REQUIRE(mgard::type_to_quantization_type() == + mgard::pb::Quantization::INT64_T); +} - quantization.set_type(mgard::pb::Quantization::INT8_T); - { - const mgard::MemoryBuffer buffer = - mgard::quantization_buffer(header, ndof); - REQUIRE_NOTHROW(mgard::check_alignment(buffer.data.get())); - REQUIRE(buffer.size == 1); - } +namespace { - quantization.set_type(mgard::pb::Quantization::INT16_T); - { - const mgard::MemoryBuffer buffer = - mgard::quantization_buffer(header, ndof); - REQUIRE_NOTHROW(mgard::check_alignment(buffer.data.get())); - REQUIRE(buffer.size == 2); - } +void test_quantization_buffer(const mgard::pb::Quantization::Type type, + const std::size_t size) { + mgard::pb::Header header; + header.mutable_quantization()->set_type(type); + const mgard::MemoryBuffer buffer = + mgard::quantization_buffer(header, 1); + REQUIRE_NOTHROW( + mgard::check_quantization_buffer(header, buffer.data.get(), buffer.size)); + REQUIRE(buffer.size == size); +} - quantization.set_type(mgard::pb::Quantization::INT32_T); - { - const mgard::MemoryBuffer buffer = - mgard::quantization_buffer(header, ndof); - REQUIRE_NOTHROW(mgard::check_alignment(buffer.data.get())); - REQUIRE(buffer.size == 4); - } +} // namespace - quantization.set_type(mgard::pb::Quantization::INT64_T); - { - const mgard::MemoryBuffer buffer = - mgard::quantization_buffer(header, ndof); - REQUIRE_NOTHROW(mgard::check_alignment(buffer.data.get())); - REQUIRE(buffer.size == 8); +TEST_CASE("quantization buffers", "[format]") { + const std::vector> + pairs{{mgard::pb::Quantization::INT8_T, 1}, + {mgard::pb::Quantization::INT16_T, 2}, + {mgard::pb::Quantization::INT32_T, 4}, + {mgard::pb::Quantization::INT64_T, 8}}; + for (const auto [type, size] : pairs) { + test_quantization_buffer(type, size); } } @@ -362,11 +361,13 @@ TEST_CASE("reading encoding compressor", "[format]") { e.set_compressor(mgard::pb::Encoding::X_HUFFMAN_LZ4); REQUIRE_THROWS(mgard::read_encoding_compressor(header)); } +#ifdef MGARD_ZSTD { e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); REQUIRE(mgard::read_encoding_compressor(header) == mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); } +#endif } namespace { diff --git a/tests/src/test_huffman.cpp b/tests/src/test_huffman.cpp new file mode 100644 index 0000000000..8874ddd168 --- /dev/null +++ b/tests/src/test_huffman.cpp @@ -0,0 +1,285 @@ +#include "catch2/catch_template_test_macros.hpp" +#include "catch2/catch_test_macros.hpp" + +#include +#include + +#include +#include +#include + +#include "testing_utilities.hpp" + +#include "format.hpp" +#include "huffman.hpp" +#include "huffman_regression.hpp" + +namespace { + +void test_encoding_regression(long int const *const quantized, + const std::size_t N) { + const mgard::HuffmanEncodedStream out = + mgard::regression::huffman_encoding(quantized, N); + const mgard::HuffmanEncodedStream out_ = + mgard::huffman_encoding(quantized, N); + + unsigned char const *const hit = out.hit.data.get(); + REQUIRE(out_.nbits == out.nbits); + const std::size_t nbytes = (out.nbits + CHAR_BIT - 1) / CHAR_BIT; + REQUIRE(std::equal(hit, hit + nbytes, out_.hit.data.get())); + + unsigned char const *const missed = out.missed.data.get(); + const std::size_t nmissed = out.missed.size; + REQUIRE(out_.missed.size == nmissed); + REQUIRE(std::equal(missed, missed + nmissed, out_.missed.data.get())); + + unsigned char const *const frequencies = out.frequencies.data.get(); + const std::size_t nfrequencies = out.frequencies.size; + REQUIRE(out_.frequencies.size == nfrequencies); + REQUIRE(std::equal(frequencies, frequencies + nfrequencies, + out_.frequencies.data.get())); +} + +void test_decoding_regression(long int const *const quantized, + const std::size_t N) { + const mgard::HuffmanEncodedStream encoded = + mgard::regression::huffman_encoding(quantized, N); + const mgard::HuffmanEncodedStream encoded_ = + mgard::regression::huffman_encoding(quantized, N); + + const mgard::MemoryBuffer out = + mgard::regression::huffman_decoding(encoded); + const mgard::MemoryBuffer out_ = mgard::huffman_decoding(encoded_); + + REQUIRE(out.size == out_.size); + REQUIRE(out.size == N); + long int const *const p = out.data.get(); + long int const *const p_ = out_.data.get(); + REQUIRE(std::equal(p, p + out.size, p_)); +} + +template void test_inversion(T const *const q, std::size_t N) { + const mgard::MemoryBuffer compressed = + mgard::huffman_encode(q, N); + const mgard::MemoryBuffer decompressed = + mgard::huffman_decode(compressed); + REQUIRE(N == decompressed.size); + REQUIRE(std::equal(q, q + N, decompressed.data.get())); +} + +void test_encoding_regression_constant(const std::size_t N, const long int q) { + long int *const quantized = new long int[N]; + std::fill(quantized, quantized + N, q); + test_encoding_regression(quantized, N); + delete[] quantized; +} + +void test_encoding_regression_periodic(const std::size_t N, + const std::size_t period, + const long int q) { + long int *const quantized = new long int[N]; + std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); + test_encoding_regression(quantized, N); + delete[] quantized; +} + +void test_encoding_regression_random(const std::size_t N, const long int a, + const long int b, + std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + long int *const quantized = new long int[N]; + std::generate(quantized, quantized + N, [&] { return dis(gen); }); + test_encoding_regression(quantized, N); + delete[] quantized; +} + +void test_decoding_regression_constant(const std::size_t N, const long int q) { + long int *const quantized = new long int[N]; + std::fill(quantized, quantized + N, q); + test_decoding_regression(quantized, N); + delete[] quantized; +} + +void test_decoding_regression_periodic(const std::size_t N, + const std::size_t period, + const long int q) { + long int *const quantized = new long int[N]; + std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); + test_decoding_regression(quantized, N); + delete[] quantized; +} + +void test_decoding_regression_random(const std::size_t N, const long int a, + const long int b, + std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + long int *const quantized = new long int[N]; + std::generate(quantized, quantized + N, [&] { return dis(gen); }); + test_decoding_regression(quantized, N); + delete[] quantized; +} + +template +void test_inversion_constant(const std::size_t N, const T q) { + T *const quantized = new T[N]; + std::fill(quantized, quantized + N, q); + test_inversion(quantized, N); + delete[] quantized; +} + +template +void test_inversion_periodic(const std::size_t N, const std::size_t period, + const T q) { + T *const quantized = new T[N]; + std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); + test_inversion(quantized, N); + delete[] quantized; +} + +template +void test_inversion_random(const std::size_t N, const T a, const T b, + std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + T *const quantized = new T[N]; + std::generate(quantized, quantized + N, [&] { return dis(gen); }); + test_inversion(quantized, N); + delete[] quantized; +} + +} // namespace + +TEST_CASE("encoding regression", "[huffman] [regression]") { + SECTION("constant data") { + test_encoding_regression_constant(10, 0); + test_encoding_regression_constant(100, 732); + test_encoding_regression_constant(1000, -10); + } + + SECTION("periodic data") { + test_encoding_regression_periodic(10, 3, -3); + test_encoding_regression_periodic(100, 10, 0); + test_encoding_regression_periodic(1000, 17, 51); + } + + SECTION("random data") { + std::default_random_engine gen(726847); + test_encoding_regression_random(10, 0, 1, gen); + test_encoding_regression_random(100, -15, -5, gen); + test_encoding_regression_random(1000, std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_encoding_regression_random(10000, -100, 100, gen); + } +} + +TEST_CASE("decoding regression", "[huffman] [regression]") { + SECTION("constant data") { + test_decoding_regression_constant(10, -11); + test_decoding_regression_constant(100, 79); + test_decoding_regression_constant(1000, -7296); + } + + SECTION("periodic data") { + test_decoding_regression_periodic(10, 4, 12); + test_decoding_regression_periodic(100, 9, -71); + test_decoding_regression_periodic(1000, 23, 3280); + } + + SECTION("random data") { + std::default_random_engine gen(363022); + test_decoding_regression_random(10, 0, 1, gen); + test_decoding_regression_random(100, -15, -5, gen); + test_decoding_regression_random(1000, std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_decoding_regression_random(10000, -100, 100, gen); + } +} + +TEMPLATE_TEST_CASE("Huffman inversion", "[huffman]", std::int8_t, std::int16_t, + std::int32_t, std::int64_t) { + std::default_random_engine gen_(454114); + std::uniform_int_distribution dis; + SECTION("constant data") { + test_inversion_constant(10, dis(gen_)); + test_inversion_constant(100, -dis(gen_)); + test_inversion_constant(1000, dis(gen_)); + } + + SECTION("periodic data") { + test_inversion_periodic(10, 3, -dis(gen_)); + test_inversion_periodic(100, 10, dis(gen_)); + test_inversion_periodic(1000, 9, -dis(gen_)); + } + + SECTION("random data") { + std::default_random_engine gen(950142); + test_inversion_random(10, 0, 1, gen); + test_inversion_random(100, -12, 11, gen); + test_inversion_random(1000, std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_inversion_random(10000, -100, 100, gen); + } +} + +namespace { + +void test_hes_serialization_inversion( + const mgard::pb::Encoding::Compressor compressor) { + mgard::pb::Header header; + mgard::pb::Encoding &encoding = *header.mutable_encoding(); + encoding.set_compressor(compressor); + encoding.set_serialization(mgard::pb::Encoding::DEPRECATED); + + // This is not intended to be a valid `HuffmanEncodedStream`. + const std::size_t nbits = 2718; + const std::size_t nmissed = 896 * sizeof(int); + const std::size_t ntable = 681 * 2 * sizeof(std::size_t); + const mgard::HuffmanEncodedStream original(nbits, nmissed, ntable); + { + unsigned char *const p = original.hit.data.get(); + std::iota(p, p + original.hit.size, 1u); + } + { + unsigned char *const p = original.missed.data.get(); + std::iota(p, p + nmissed, 90u); + } + { + unsigned char *const p = original.frequencies.data.get(); + std::iota(p, p + ntable, 51u); + } + + const mgard::MemoryBuffer serialized = + mgard::serialize_compress(header, original); + const mgard::HuffmanEncodedStream deserialized = + mgard::decompress_deserialize(header, serialized.data.get(), + serialized.size); + + REQUIRE(original.nbits == deserialized.nbits); + REQUIRE(original.hit.size == deserialized.hit.size); + REQUIRE(original.missed.size == deserialized.missed.size); + REQUIRE(original.frequencies.size == deserialized.frequencies.size); + + { + unsigned char const *const p = original.hit.data.get(); + unsigned char const *const q = deserialized.hit.data.get(); + REQUIRE(std::equal(p, p + original.hit.size, q)); + } + { + unsigned char const *const p = original.missed.data.get(); + unsigned char const *const q = deserialized.missed.data.get(); + REQUIRE(std::equal(p, p + nmissed, q)); + } + { + unsigned char const *const p = original.frequencies.data.get(); + unsigned char const *const q = deserialized.frequencies.data.get(); + REQUIRE(std::equal(p, p + ntable, q)); + } +} + +} // namespace + +TEST_CASE("`HuffmanEncodedStream` serialization inversion", "[huffman]") { + test_hes_serialization_inversion(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); +#ifdef MGARD_ZSTD + test_hes_serialization_inversion(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); +#endif +} diff --git a/tests/src/test_lossless.cpp b/tests/src/test_lossless.cpp new file mode 100644 index 0000000000..b4e9c06661 --- /dev/null +++ b/tests/src/test_lossless.cpp @@ -0,0 +1,441 @@ +#include "catch2/catch_template_test_macros.hpp" +#include "catch2/catch_test_macros.hpp" + +#include + +#include +#include +#include + +#include "format.hpp" +#include "lossless.hpp" +#include "lossless_regression.hpp" + +#include "testing_utilities.hpp" + +namespace { + +// Generate a header for use with the deprecated Huffman serialization method. +mgard::pb::Header +deprecated_header(const mgard::pb::Encoding::Compressor compressor) { + mgard::pb::Header header; + header.mutable_quantization()->set_type(mgard::pb::Quantization::INT64_T); + header.mutable_encoding()->set_preprocessor(mgard::pb::Encoding::SHUFFLE); + header.mutable_encoding()->set_compressor(compressor); + header.mutable_encoding()->set_serialization(mgard::pb::Encoding::DEPRECATED); + return header; +} + +void test_huffman_compression_regression(long int const *const src, + const std::size_t srcLen) { + std::vector compressors; + compressors.push_back(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); +#ifdef MGARD_ZSTD + compressors.push_back(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); +#endif + + for (mgard::pb::Encoding::Compressor compressor : compressors) { + const mgard::pb::Header header = deprecated_header(compressor); + const mgard::MemoryBuffer out = + mgard::regression::compress_memory_huffman(header, src, srcLen); + unsigned char const *const p = out.data.get(); + + const mgard::MemoryBuffer out_ = mgard::compress( + header, const_cast(src), srcLen * sizeof(long int)); + unsigned char const *const p_ = out_.data.get(); + + REQUIRE(out.size == out_.size); + REQUIRE(std::equal(p, p + out.size, p_)); + } +} + +void test_huffman_decompression_regression(long int const *const src, + const std::size_t srcLen) { + std::vector compressors; + compressors.push_back(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); +#ifdef MGARD_ZSTD + compressors.push_back(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); +#endif + + for (const mgard::pb::Encoding::Compressor compressor : compressors) { + const mgard::pb::Header header = deprecated_header(compressor); + + const mgard::MemoryBuffer compressed = + mgard::regression::compress_memory_huffman(header, src, srcLen); + const mgard::MemoryBuffer compressed_(compressed.size); + + unsigned char *const q = compressed.data.get(); + unsigned char *const q_ = compressed_.data.get(); + std::copy(q, q + compressed.size, q_); + + mgard::MemoryBuffer out(srcLen); + mgard::MemoryBuffer out_(srcLen); + + long int *const p = out.data.get(); + long int *const p_ = out_.data.get(); + + mgard::regression::decompress_memory_huffman(header, q, compressed.size, p, + out.size * sizeof(long int)); + + mgard::decompress(header, q_, compressed_.size, out_.data.get(), + out_.size * sizeof(long int)); + REQUIRE(std::equal(p, p + srcLen, p_)); + } +} + +void test_hcr_constant(const std::size_t srcLen, const long int q) { + long int *const src = new long int[srcLen]; + std::fill(src, src + srcLen, q); + test_huffman_compression_regression(src, srcLen); + delete[] src; +} + +void test_hcr_periodic(const std::size_t srcLen, const std::size_t period, + const long int initial) { + long int *const src = new long int[srcLen]; + std::generate(src, src + srcLen, PeriodicGenerator(period, initial)); + test_huffman_compression_regression(src, srcLen); + delete[] src; +} + +void test_hcr_random(const std::size_t srcLen, const long int a, + const long int b, std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + long int *const src = new long int[srcLen]; + std::generate(src, src + srcLen, [&] { return dis(gen); }); + test_huffman_compression_regression(src, srcLen); + delete[] src; +} + +void test_hdr_constant(const std::size_t srcLen, const long int q) { + long int *const src = new long int[srcLen]; + std::fill(src, src + srcLen, q); + test_huffman_decompression_regression(src, srcLen); + delete[] src; +} + +void test_hdr_periodic(const std::size_t srcLen, const std::size_t period, + const long int initial) { + long int *const src = new long int[srcLen]; + std::generate(src, src + srcLen, PeriodicGenerator(period, initial)); + test_huffman_decompression_regression(src, srcLen); + delete[] src; +} + +void test_hdr_random(const std::size_t srcLen, const long int a, + const long int b, std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + long int *const src = new long int[srcLen]; + std::generate(src, src + srcLen, [&] { return dis(gen); }); + test_huffman_decompression_regression(src, srcLen); + delete[] src; +} + +} // namespace + +TEST_CASE("Huffman compression regression", "[compressors] [regression]") { + SECTION("constant data") { + test_hcr_constant(5, -3); + test_hcr_constant(25, 0); + test_hcr_constant(625, 81); + } + + SECTION("periodic data") { + test_hcr_periodic(5, 5, 0); + test_hcr_periodic(25, 6, -4); + test_hcr_periodic(625, 20, 22); + } + + SECTION("random data") { + std::default_random_engine gen(131051); + test_hcr_random(50, 0, 1, gen); + test_hcr_random(25, -8, 16, gen); + test_hcr_random(625, std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_hcr_random(3125, -100, 100, gen); + } +} + +TEST_CASE("Huffman decompression regression", "[compressors] [regression]") { + SECTION("constant data") { + test_hdr_constant(4, -143485); + test_hdr_constant(64, 0); + test_hdr_constant(256, 67486); + } + + SECTION("periodic data") { + test_hdr_periodic(10, 3, 0); + test_hdr_periodic(100, 10, -570); + test_hdr_periodic(1000, 19, 394); + } + + SECTION("random data") { + std::default_random_engine gen(566222); + test_hdr_random(100, 1, 2, gen); + test_hdr_random(30, -7, 7, gen); + test_hdr_random(900, std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_hdr_random(2700, -60, 40, gen); + } +} + +#ifdef MGARD_ZSTD +namespace { + +void test_zstd_identity(std::uniform_int_distribution &dis, + std::default_random_engine &gen, const std::size_t n) { + const auto f = [&]() -> unsigned char { return dis(gen); }; + unsigned char *const src = new unsigned char[n]; + std::generate(src, src + n, f); + unsigned char *const src_ = new unsigned char[n]; + std::copy(src, src + n, src_); + mgard::MemoryBuffer dst = mgard::compress_zstd(src_, n); + delete[] src_; + unsigned char *const decompressed = new unsigned char[n]; + mgard::decompress_zstd(dst.data.get(), dst.size, decompressed, n); + REQUIRE(std::equal(src, src + n, decompressed)); + delete[] decompressed; + delete[] src; +} + +} // namespace + +TEST_CASE("zstd compression", "[compressors]") { + std::uniform_int_distribution dis; + std::default_random_engine gen(158648); + const std::vector ns{10, 10, 1000, 10000}; + for (const std::size_t n : ns) { + test_zstd_identity(dis, gen, n); + } +} +#endif + +namespace { + +void test_zlib_identity(std::uniform_int_distribution &dis, + std::default_random_engine &gen, const std::size_t n) { + const auto f = [&]() -> unsigned char { return dis(gen); }; + unsigned char *const src = new unsigned char[n]; + std::generate(src, src + n, f); + unsigned char *const src_ = new unsigned char[n]; + std::copy(src, src + n, src_); + mgard::MemoryBuffer dst = mgard::compress_zlib(src_, n); + delete[] src_; + unsigned char *const decompressed = new unsigned char[n]; + mgard::decompress_zlib(dst.data.get(), dst.size, decompressed, n); + REQUIRE(std::equal(src, src + n, decompressed)); + delete[] decompressed; + delete[] src; +} + +} // namespace + +TEST_CASE("zlib compression", "[compressors]") { + std::uniform_int_distribution dis; + std::default_random_engine gen(252315); + const std::vector ns{10, 10, 1000, 10000}; + for (const std::size_t n : ns) { + test_zlib_identity(dis, gen, n); + } +} + +namespace { + +template +void test_cd_inversion(const mgard::pb::Header &header, + Int const *const quantized, const std::size_t n) { + const std::size_t nbytes = sizeof(Int) * n; + + Int *const quantized_ = new Int[n]; + std::copy(quantized, quantized + n, quantized_); + const mgard::MemoryBuffer compressed = + mgard::compress(header, quantized_, nbytes); + delete[] quantized_; + + Int *const decompressed = new Int[n]; + mgard::decompress(header, compressed.data.get(), compressed.size, + decompressed, nbytes); + REQUIRE(std::equal(quantized, quantized + n, decompressed)); + delete[] decompressed; +} + +template +void test_cd_inversion_constant(const mgard::pb::Header &header, + const std::size_t N, const Int q) { + Int *const quantized = new Int[N]; + std::fill(quantized, quantized + N, q); + test_cd_inversion(header, quantized, N); + delete[] quantized; +} + +template +void test_cd_inversion_periodic(const mgard::pb::Header &header, + const std::size_t N, const std::size_t period, + const Int q) { + Int *const quantized = new Int[N]; + std::generate(quantized, quantized + N, PeriodicGenerator(period, q)); + test_cd_inversion(header, quantized, N); + delete[] quantized; +} + +template +void test_cd_inversion_random(const mgard::pb::Header &header, + const std::size_t N, const Int a, const Int b, + std::default_random_engine &gen) { + std::uniform_int_distribution dis(a, b); + Int *const quantized = new Int[N]; + std::generate(quantized, quantized + N, [&] { return dis(gen); }); + test_cd_inversion(header, quantized, N); + delete[] quantized; +} + +template +mgard::pb::Quantization::Type type_to_quantization_type(); + +template <> +mgard::pb::Quantization::Type type_to_quantization_type() { + return mgard::pb::Quantization::INT8_T; +} + +template <> +mgard::pb::Quantization::Type type_to_quantization_type() { + return mgard::pb::Quantization::INT16_T; +} + +template <> +mgard::pb::Quantization::Type type_to_quantization_type() { + return mgard::pb::Quantization::INT32_T; +} + +template <> +mgard::pb::Quantization::Type type_to_quantization_type() { + return mgard::pb::Quantization::INT64_T; +} + +template +void test_cd_inversion_constant(const mgard::pb::Header &header) { + test_cd_inversion_constant(header, 100, 98); + test_cd_inversion_constant(header, 1000, 0); + test_cd_inversion_constant(header, 10000, -62); +} + +template +void test_cd_inversion_periodic(const mgard::pb::Header &header) { + test_cd_inversion_periodic(header, 100, 3, -5); + test_cd_inversion_periodic(header, 1000, 60, 86); + test_cd_inversion_periodic(header, 10000, 62, 7); +} + +template +void test_cd_inversion_random(const mgard::pb::Header &header) { + std::default_random_engine gen(894584); + test_cd_inversion_random(header, 100, 0, 3, gen); + test_cd_inversion_random(header, 1000, std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_cd_inversion_random(header, 10000, -110, 110, gen); +} + +template <> +void test_cd_inversion_random(const mgard::pb::Header &header) { + std::default_random_engine gen(952426); + test_cd_inversion_random(header, 100, -1, 1, gen); + // In the deprecated Huffman encoding function, the missed symbols are cast + // from `long int` to `int`. + test_cd_inversion_random(header, 1000, + std::numeric_limits::min(), + std::numeric_limits::max(), gen); + test_cd_inversion_random(header, 10000, 0, 250, gen); +} + +template +void test_cd_inversion(const mgard::pb::Header &header) { + SECTION("constant data") { test_cd_inversion_constant(header); } + SECTION("periodic data") { test_cd_inversion_periodic(header); } + SECTION("random data") { test_cd_inversion_random(header); } +} + +} // namespace + +TEMPLATE_TEST_CASE("`compress`/`decompress` inversion", "[compressors]", + std::int8_t, std::int16_t, std::int32_t, std::int64_t) { + mgard::pb::Header header; + mgard::populate_defaults(header); + mgard::pb::Quantization &q = *header.mutable_quantization(); + mgard::pb::Encoding &e = *header.mutable_encoding(); + + const mgard::pb::Quantization::Type qtype = + type_to_quantization_type(); + q.set_type(qtype); + + SECTION("`CPU_ZLIB`") { + e.set_compressor(mgard::pb::Encoding::CPU_ZLIB); + test_cd_inversion(header); + } + +#ifdef MGARD_ZSTD + SECTION("`CPU_ZSTD`") { + e.set_compressor(mgard::pb::Encoding::CPU_ZSTD); + test_cd_inversion(header); + } +#endif + + // The deprecated Huffman serialization method requires the quantization type + // to be `std::int64_t`. + if (qtype == mgard::pb::Quantization::INT64_T) { + SECTION("`CPU_HUFFMAN_ZLIB` with `DEPRECATED`") { + e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); + e.set_serialization(mgard::pb::Encoding::DEPRECATED); + test_cd_inversion(header); + } + +#ifdef MGARD_ZSTD + SECTION("`CPU_HUFFMAN_ZSTD` with `DEPRECATED`") { + e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); + e.set_serialization(mgard::pb::Encoding::DEPRECATED); + test_cd_inversion(header); + } +#endif + } + + SECTION("`CPU_HUFFMAN_ZLIB` with `RFMH`") { + e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB); + e.set_serialization(mgard::pb::Encoding::RFMH); + test_cd_inversion(header); + } + +#ifdef MGARD_ZSTD + SECTION("`CPU_HUFFMAN_ZSTD` with `RFMH`") { + e.set_compressor(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD); + e.set_serialization(mgard::pb::Encoding::RFMH); + test_cd_inversion(header); + } +#endif +} + +// In the deprecated Huffman encoding function, the missed symbols are cast from +// `long int` to `int`. +TEST_CASE("deprecated Huffman inversion", "[compressors] [!shouldfail]") { + std::default_random_engine gen(257100); + const std::int64_t a = + 2 * static_cast(std::numeric_limits::min()); + const std::int64_t b = + 2 * static_cast(std::numeric_limits::max()); + + SECTION("`CPU_HUFFMAN_ZLIB` with `DEPRECATED`") { + // Conceivably this could pass if all the generated `std::int64_t`s are + // representable as `int`s. + test_cd_inversion_random( + deprecated_header(mgard::pb::Encoding::CPU_HUFFMAN_ZLIB), 5000, a, b, + gen); + } + +#ifdef MGARD_ZSTD + SECTION("`CPU_HUFFMAN_ZSTD` with `DEPRECATED`") { + // Conceivably this could pass if all the generated `std::int64_t`s are + // representable as `int`s. + test_cd_inversion_random( + deprecated_header(mgard::pb::Encoding::CPU_HUFFMAN_ZSTD), 5000, a, b, + gen); + } +#endif +} diff --git a/tests/src/test_utilities.cpp b/tests/src/test_utilities.cpp index 3d665e331e..e5d66656e8 100644 --- a/tests/src/test_utilities.cpp +++ b/tests/src/test_utilities.cpp @@ -171,3 +171,140 @@ TEST_CASE("CartesianProduct predecessors and successors", "[utilities]") { REQUIRE(tracker); } + +namespace { + +void test_bit_equality(const mgard::Bits &bits, + const std::vector &expected) { + TrialTracker tracker; + std::vector::const_iterator p = expected.begin(); + for (const bool b : bits) { + tracker += b == *p++; + } + REQUIRE(tracker); +} + +} // namespace + +TEST_CASE("Bits iteration", "[utilities]") { + SECTION("zero end offsets") { + { + unsigned char const a[1]{0x3d}; + const mgard::Bits bits(a, a + 1); + const std::vector expected{// `3`. + false, false, true, true, + // `d`. + true, true, false, true}; + test_bit_equality(bits, expected); + } + { + unsigned char const a[2]{0xe6, 0x0a}; + const mgard::Bits bits(a, a + 2); + const std::vector expected{// `e`. + true, true, true, false, + // `6`. + false, true, true, false, + // `0`. + false, false, false, false, + // `a`. + true, false, true, false}; + test_bit_equality(bits, expected); + } + { + unsigned char const a[3]{0x12, 0x0c, 0xff}; + const mgard::Bits bits(a, a + 3); + const std::vector expected{// `1`. + false, false, false, true, + // `2`. + false, false, true, false, + // `0`. + false, false, false, false, + // `c`. + true, true, false, false, + // `f`. + true, true, true, true, + // `f`. + true, true, true, true}; + test_bit_equality(bits, expected); + } + } + SECTION("nonzero end offsets") { + { + unsigned char const a[1]{0xff}; + const mgard::Bits bits(a, a, 7); + const std::vector expected(7, true); + test_bit_equality(bits, expected); + } + { + unsigned char const a[2]{0xa9, 0x33}; + const mgard::Bits bits(a, a + 1, 2); + const std::vector expected{true, false, true, false, true, + false, false, true, false, false}; + test_bit_equality(bits, expected); + } + { + unsigned char const a[3]{0x1e, 0x0f, 0x77}; + const mgard::Bits bits(a, a + 2, 6); + const std::vector expected{false, false, false, true, true, true, + true, false, false, false, false, false, + true, true, true, true, false, true, + true, true, false, true}; + test_bit_equality(bits, expected); + } + } +} + +TEST_CASE("Chain iteration", "[utilities]") { + SECTION("reading") { + const std::size_t N = 5; + std::array, N> in; + in.at(0) = {0}; + in.at(1) = {1, 2, 3}; + in.at(2) = {}; + in.at(3) = {4, 5, 6}; + in.at(4) = {7, 8, 9, 10}; + using It = std::vector::const_iterator; + std::vector> segments; + for (const std::vector &in_ : in) { + segments.push_back({in_.begin(), in_.size()}); + } + unsigned char expected = 0; + TrialTracker tracker; + for (const unsigned char read : mgard::Chain(segments)) { + tracker += read == expected++; + } + REQUIRE(tracker); + REQUIRE(expected == 11); + } + + SECTION("writing") { + const std::size_t N = 4; + std::array, N> out; + const std::array ns{3, 5, 0, 1}; + using It = std::vector::iterator; + std::vector> segments; + segments.reserve(N); + for (std::size_t i = 0; i < N; ++i) { + std::vector &out_ = out.at(i); + const std::size_t n = ns.at(i); + out_.resize(n); + segments.push_back({out_.begin(), n}); + } + + unsigned short int a = 1; + unsigned short int b = 1; + for (unsigned short int &c : mgard::Chain(segments)) { + c = a; + const unsigned short int tmp = a + b; + a = b; + b = tmp; + } + + std::array, N> expected; + expected.at(0) = {1, 1, 2}; + expected.at(1) = {3, 5, 8, 13, 21}; + expected.at(2) = {}; + expected.at(3) = {34}; + REQUIRE(out == expected); + } +} diff --git a/tests/src/testing_utilities.cpp b/tests/src/testing_utilities.cpp index 822c6c87e3..d84210d5fd 100644 --- a/tests/src/testing_utilities.cpp +++ b/tests/src/testing_utilities.cpp @@ -20,3 +20,11 @@ std::ostream &operator<<(std::ostream &os, const TrialTracker &tracker) { return os << tracker.nsuccesses << " successes and " << tracker.nfailures << " failures out of " << tracker.ntrials << " trials"; } + +PeriodicGenerator::PeriodicGenerator(const std::size_t period, + const long int value) + : period(period), value(value), ncalls(0) {} + +long int PeriodicGenerator::operator()() { + return value + static_cast(ncalls++ % period); +}