From d4cc9291fdeda7892baed04b1131fe20a9fea4e0 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 13 Aug 2024 09:43:10 -0400 Subject: [PATCH 01/20] add license headers --- Cargo.toml | 4 ++++ src/builder.rs | 14 ++++++++++++++ src/lib.rs | 14 ++++++++++++++ src/longest.rs | 14 ++++++++++++++ 4 files changed, 46 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 030301c..b6e4520 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,10 @@ [package] name = "fsst-rs" version = "0.0.1" +description = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression" +authors = ["SpiralDB Developers "] +license = "Apache-2.0" +repository = "https://github.com/spiraldb/fsst" edition = "2021" [lints.rust] diff --git a/src/builder.rs b/src/builder.rs index 31933db..f4fc9dd 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -1,3 +1,17 @@ +// Copyright 2024 Spiral, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + //! Functions and types used for building a [`SymbolTable`] from a corpus of text. //! //! This module implements the logic from Algorithm 3 of the [FSST Paper]. diff --git a/src/lib.rs b/src/lib.rs index 068f610..7725e91 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,17 @@ +// Copyright 2024 Spiral, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #![doc = include_str!("../README.md")] use std::fmt::{Debug, Formatter}; diff --git a/src/longest.rs b/src/longest.rs index 445a88a..d8c433f 100644 --- a/src/longest.rs +++ b/src/longest.rs @@ -1,3 +1,17 @@ +// Copyright 2024 Spiral, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + use crate::{Code, SymbolTable}; /// Find the longest substring. 
From 9062072fcc307023f75d8e66f476e27470851415 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 13 Aug 2024 19:35:28 -0400 Subject: [PATCH 02/20] save --- Cargo.toml | 2 +- src/builder.rs | 12 +- src/find_longest/mod.rs | 21 ++ src/{longest.rs => find_longest/naive.rs} | 5 +- src/lib.rs | 228 ++++++++++++++++++---- src/lossy_pht.rs | 199 +++++++++++++++++++ 6 files changed, 420 insertions(+), 47 deletions(-) create mode 100644 src/find_longest/mod.rs rename src/{longest.rs => find_longest/naive.rs} (90%) create mode 100644 src/lossy_pht.rs diff --git a/Cargo.toml b/Cargo.toml index b6e4520..d54d01e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ edition = "2021" [lints.rust] warnings = "deny" -missing_docs = "deny" +# missing_docs = "deny" [lints.clippy] all = { level = "deny", priority = -1 } diff --git a/src/builder.rs b/src/builder.rs index f4fc9dd..8dece99 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -21,6 +21,7 @@ use std::cmp::Ordering; use std::collections::BinaryHeap; +use crate::find_longest::FindLongestSymbol; use crate::{Code, Symbol, SymbolTable}; #[derive(Debug, Clone)] @@ -147,10 +148,13 @@ impl SymbolTable { } // Pop the 255 best symbols. - pqueue - .iter() - .take(255) - .for_each(|candidate| res.insert(candidate.symbol)); + let mut n_symbols = 0; + while !pqueue.is_empty() && n_symbols < 255 { + let candidate = pqueue.pop().unwrap(); + if res.insert(candidate.symbol) { + n_symbols += 1; + } + } res } diff --git a/src/find_longest/mod.rs b/src/find_longest/mod.rs new file mode 100644 index 0000000..af9d80b --- /dev/null +++ b/src/find_longest/mod.rs @@ -0,0 +1,21 @@ +// Copyright 2024 Spiral, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::Code; + +mod naive; + +pub trait FindLongestSymbol { + fn find_longest_symbol(&self, text: &[u8]) -> Code; +} diff --git a/src/longest.rs b/src/find_longest/naive.rs similarity index 90% rename from src/longest.rs rename to src/find_longest/naive.rs index d8c433f..1ca9788 100644 --- a/src/longest.rs +++ b/src/find_longest/naive.rs @@ -12,14 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +use crate::find_longest::FindLongestSymbol; use crate::{Code, SymbolTable}; /// Find the longest substring. -impl SymbolTable { +impl FindLongestSymbol for SymbolTable { // NOTE(aduffy): if you don't disable inlining, this function won't show up in profiles. #[inline(never)] - pub(crate) fn find_longest_symbol(&self, text: &[u8]) -> Code { + fn find_longest_symbol(&self, text: &[u8]) -> Code { debug_assert!(!text.is_empty(), "text must not be empty"); // Find the code that best maps to the provided text table here. diff --git a/src/lib.rs b/src/lib.rs index 7725e91..4e83ebe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,25 +13,32 @@ // limitations under the License. #![doc = include_str!("../README.md")] + +/// Throw a compiler error if a type isn't guaranteed to have a specific size in bytes. +macro_rules! 
assert_sizeof { + ($typ:ty => $size_in_bytes:expr) => { + const _: [u8; $size_in_bytes] = [0; std::mem::size_of::<$typ>()]; + }; +} + use std::fmt::{Debug, Formatter}; pub use builder::*; +use lossy_pht::LossyPHT; mod builder; -mod longest; +mod find_longest; +mod lossy_pht; -/// A Symbol wraps a set of values of +/// `Symbol`s are small (up to 8-byte) segments of strings, stored in a [`SymbolTable`] and +/// identified by an 8-bit [`Code`]. #[derive(Copy, Clone)] pub union Symbol { bytes: [u8; 8], num: u64, } -impl Debug for Symbol { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", unsafe { self.num }) - } -} +assert_sizeof!(Symbol => 8); impl Symbol { /// Zero value for `Symbol`. @@ -60,6 +67,7 @@ impl Symbol { /// /// Each symbol has the capacity to hold up to 8 bytes of data, but the symbols /// can contain fewer bytes, padded with 0x00. + #[inline(never)] pub fn len(&self) -> usize { let numeric = unsafe { self.num }; // For little-endian platforms, this counts the number of *trailing* zeros @@ -75,7 +83,37 @@ impl Symbol { self.len() == 0 } - /// Create a ew + #[inline] + pub fn as_u64(&self) -> u64 { + // SAFETY: the bytes can always be viewed as a u64 + unsafe { self.num } + } + + /// Get the first byte of the symbol as a `u8`. + /// + /// # Safety + /// The function will never panic, but if the symbol's len is < 1, the + /// result may be meaningless. It is up to the caller to ensure that + /// the first byte of the symbol contains valid data. + #[inline] + pub fn first_byte(&self) -> u8 { + // SAFETY: the bytes can always be viewed as a u64 + unsafe { self.num as u8 } + } + + /// Get the first two bytes of the symbol as a `u16`. + /// + /// # Safety + /// The function will never panic, but if the symbol's len is < 2, the + /// result may be meaningless. It is up to the caller to ensure that + /// the first two bytes of the symbol contain valid data. 
+ #[inline] + pub fn first_two_bytes(&self) -> u16 { + // SAFETY: the bytes can always be viewed as a u64 + unsafe { self.num as u16 } + } + + /// Access the Symbol as a slice. pub fn as_slice(&self) -> &[u8] { let len = self.len(); // SAFETY: constructors will not allow building a struct where len > 8. @@ -94,12 +132,20 @@ impl Symbol { let self_len = self.len(); let mut result = *self; + + // SAFETY: self_len and new_len are checked to be <= 8 unsafe { result.bytes[self_len..new_len].copy_from_slice(other.as_slice()) }; result } } +impl Debug for Symbol { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", unsafe { self.bytes }) + } +} + /// Codes used to map symbols to bytes. /// /// Logically, codes can range from 0-255 inclusive. Physically, we represent them as a 9-bit @@ -107,11 +153,11 @@ impl Symbol { /// /// Physically in-memory, `Code(0)` through `Code(255)` corresponds to escape sequences of raw bytes /// 0 through 255. `Code(256)` through `Code(511)` represent the actual codes -255. -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] pub struct Code(u16); impl Code { - /// Maximum code value for the in-memory `Code` representation. + /// Maximum value for the in-memory `Code` representation. /// /// When truncated to u8 this is code 255, which is equivalent to [`Self::ESCAPE_CODE`]. pub const CODE_MAX: u16 = 511; @@ -157,6 +203,15 @@ impl Code { } } +impl Debug for Code { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Code") + .field("code_byte", &(self.0 as u8)) + .field("escape", &(self.0 < 256)) + .finish() + } +} + /// The static symbol table used for compression and decompression. /// /// The `SymbolTable` is the central component of FSST. 
You can create a SymbolTable either by @@ -174,16 +229,23 @@ impl Code { /// ``` /// /// [training]: [`train`] -#[derive(Clone, Debug)] +#[derive(Clone)] pub struct SymbolTable { /// Table mapping codes to symbols. pub(crate) symbols: [Symbol; 511], - /// Indicates the number of entries in the symbol table that have been populated. - /// - /// This value is always at least 256, as the first 256 entries in the `table` are the escape - /// bytes. - pub(crate) n_symbols: usize, + /// Indicates the number of entries in the symbol table that have been populated, not counting + /// the escape values. + pub(crate) n_symbols: u8, + + // + // Index structures used to speedup building the symbol table and compression + // + /// Inverted index mapping 2-byte symbols to codes + codes_twobyte: [u16; 65_536], + + /// Lossy perfect hash table for looking up codes to symbols that are 3 bytes or more + lossy_pht: LossyPHT, } impl Default for SymbolTable { @@ -191,13 +253,23 @@ impl Default for SymbolTable { let mut table = Self { symbols: [Symbol::ZERO; 511], n_symbols: 0, + codes_twobyte: [0; 65_536], + lossy_pht: LossyPHT::new(), }; // Populate the escape byte entries. for byte in 0..=255 { table.symbols[byte as usize] = Symbol::from_u8(byte); } - table.n_symbols = 256; + + // Populate the "codes" for twobytes to default to the escape sequence. + for first in 0..256 { + for second in 0..256 { + let index = (first << 8) | second; + table.codes_twobyte[index as usize] = + ((first << 8) | (Code::ESCAPE_CODE as usize)) as u16; + } + } table } @@ -208,39 +280,115 @@ impl Default for SymbolTable { /// The symbol table is trained on a corpus of data in the form of a single byte array, building up /// a mapping of 1-byte "codes" to sequences of up to `N` plaintext bytse, or "symbols". impl SymbolTable { - /// Insert a new symbol at the end of the table. + /// Attempt to insert a new symbol at the end of the table. /// /// # Panics /// Panics if the table is already full. 
- pub fn insert(&mut self, symbol: Symbol) { - assert!( - self.n_symbols < self.symbols.len(), - "cannot insert into full symbol table" - ); - self.symbols[self.n_symbols] = symbol; + pub fn insert(&mut self, symbol: Symbol) -> bool { + assert!(self.n_symbols < 255, "cannot insert into full symbol table"); + + let symbol_len = symbol.len(); + if symbol_len == 2 { + // Speculatively insert the symbol into the twobyte cache + self.codes_twobyte[symbol.first_two_bytes() as usize] = self.n_symbols as u16; + } else if symbol_len >= 3 { + // Attempt to insert larger symbols into the 3-byte cache + if !self.lossy_pht.insert(symbol, self.n_symbols) { + return false; + } + } + + // Insert at the end of the symbols table. + // Note the rescaling from range [0-254] -> [256, 510]. + self.symbols[256 + (self.n_symbols as usize)] = symbol; self.n_symbols += 1; + true + } + + /// Using the symbol table, runs a single cycle of compression from the front of `in_ptr`, writing + /// the output into `out_ptr`. + /// + /// # Returns + /// + /// This function returns a tuple of (code, advance_in, advance_out). + /// + /// `code` is the code that was emitted into the output buffer. + /// + /// `advance_in` is the number of bytes to advance `in_ptr` before the next call. + /// + /// `advance_out` is the number of bytes to advance `out_ptr` before the next call. + /// + /// # Safety + /// + /// `in_ptr` and `out_ptr` must never be NULL or otherwise point to invalid memory. + pub(crate) unsafe fn compress_single( + &self, + in_ptr: *const u8, + out_ptr: *mut u8, + ) -> (u8, usize, usize) { + // Load a full 8-byte word of data from in_ptr. + // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. + let word: u64 = unsafe { (in_ptr as *const u64).read_unaligned() }; + + // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and + // if it isn't, it will be overwritten anyway. 
+ // + // SAFETY: caller ensures out_ptr is not null + let first_byte = word as u8; + unsafe { out_ptr.byte_add(1).write_unaligned(first_byte) }; + + // Access the hash table, and see if we have a match. + let entry = self.lossy_pht.lookup(word); + + // Now, downshift the `word` and the `entry` to see if they align. + let word_prefix = + word >> (0xFF_FF_FF_FF_FF_FF_FF_FFu64 >> entry.packed_meta.ignored_bits()); + + // This ternary-like branch corresponds to the "conditional move" line from the paper's Algorithm 4: + // if the shifted word and symbol match, we use it. Else, we use the precomputed twobyte code for this + // byte sequence. + let code = if entry.symbol.as_u64() == word_prefix && !entry.packed_meta.is_unused() { + entry.packed_meta.code() as u16 + } else { + self.codes_twobyte[(word as u16) as usize] + }; + // Write the first byte of `code` to the output position. + // The code will either by a real code with a single byte of padding, OR a two-byte code sequence. + unsafe { + out_ptr.write_unaligned(code as u8); + }; + + // Seek the output pointer forward. + let advance_in = (64 - entry.packed_meta.ignored_bits()) >> 3; + let advance_out = 2 - ((code >> 8) & 1) as usize; + + (code as u8, advance_in, advance_out) } /// Use the symbol table to compress the plaintext into a sequence of codes and escapes. pub fn compress(&self, plaintext: &[u8]) -> Vec { - let mut values = Vec::with_capacity(2 * plaintext.len()); - let len = plaintext.len(); - let mut pos = 0; - while pos < len { - let next_code = self.find_longest_symbol(&plaintext[pos..len]); - if next_code.is_escape() { - // Case 1 -escape: push an ESCAPE followed by the next byte. 
- values.push(Code::ESCAPE_CODE); - values.push(next_code.0 as u8); - pos += 1; - } else { - // Case 2 - code: push the code, increment position by symbol length - let symbol = self.symbols[next_code.0 as usize]; - values.push(next_code.0 as u8); - pos += symbol.len(); - } + let mut values: Vec = Vec::with_capacity(2 * plaintext.len()); + + let mut in_ptr = plaintext.as_ptr(); + let mut out_ptr = values.as_mut_ptr(); + + // SAFETY: `end` will point just after the end of the `plaintext` slice. + let in_end = unsafe { in_ptr.byte_add(plaintext.len()) }; + // SAFETY: `end` will point just after the end of the `values` allocation. + let out_end = unsafe { out_ptr.byte_add(values.capacity()) }; + + while in_ptr < in_end && out_ptr < out_end { + // SAFETY: pointer ranges are checked in the loop condition + unsafe { + let (_, advance_in, advance_out) = self.compress_single(in_ptr, out_ptr); + in_ptr = in_ptr.byte_add(advance_in); + out_ptr = out_ptr.byte_add(advance_out); + }; } + // in_ptr should have exceeded in_end + assert!(in_ptr >= in_end, "exhausted output buffer before exhausting input, there is a bug in SymbolTable::compress()"); + values } diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs new file mode 100644 index 0000000..0a25562 --- /dev/null +++ b/src/lossy_pht.rs @@ -0,0 +1,199 @@ +use std::fmt::Debug; +use std::fmt::Formatter; +use std::u16; + +use crate::Code; +use crate::Symbol; + +/// Size of the perfect hash table. +/// +/// NOTE: this differs from the paper, which recommends a 64KB total +/// table size. The paper does not account for the fact that most +/// vendors split the L1 cache into 32KB of instruction and 32KB of data. +pub const HASH_TABLE_SIZE: usize = 1 << 11; + +/// Bit-packed metadata for a [`TableEntry`] +/// +/// Bitpacked layout: +/// +/// bits 10-15: ignored bits in the symbol. 
Equivalent to 64 - symbol.len()*8 +/// bit 9: not used +/// bit 8: the "unused" flag +/// bits 0-7: code value (0-254) +#[derive(Clone, Copy)] +#[repr(C)] +pub(crate) struct PackedMeta(u16); + +assert_sizeof!(PackedMeta => 2); + +impl PackedMeta { + /// Constant unused instance. + /// + /// All bits are set, corresponding to + /// + /// 6 bits set for `ignored bits` + /// 1 bit to indicate the `unused` flag + /// 8 bits of `code` data + pub const UNUSED: Self = Self(0xFFFF); + + /// The 8th bit toggles if the slot is unused or not. + const UNUSED_FLAG: u16 = 1 << 8; + + /// Create a new `PackedSymbolMeta` from raw parts. + /// + /// # Panics + /// If `len` > 8 or `code` > [`Code::CODE_MAX`] + pub fn new(len: u16, code: u8) -> Self { + assert!(len <= 8, "cannot construct PackedCode with len > 8"); + + let ignored_bits = 64 - 8 * len; + + let packed = (ignored_bits << 10) | (code as u16); + Self(packed) + } + + /// Import a `PackedSymbolMeta` from a raw `u16`. + pub fn from_u16(value: u16) -> Self { + assert!( + (value >> 12) <= 64, + "cannot construct PackedCode with len > 8" + ); + assert!( + (value & 0b111_111_111) <= Code::CODE_MAX, + "cannot construct PackedCode with code > CODE_MAX" + ); + + Self(value) + } + + /// Get the number of ignored bits in the corresponding symbol's `u64` representation. + /// + /// Always <= 64 + #[inline] + pub(crate) fn ignored_bits(&self) -> usize { + ((self.0 >> 12) & 0b1111) as usize + } + + /// Get the raw code value. + #[inline] + pub(crate) fn code(&self) -> u8 { + self.0 as u8 + } + + /// Check if the unused flag is set + #[inline] + pub(crate) fn is_unused(&self) -> bool { + (self.0 & Self::UNUSED_FLAG) != 0 + } +} + +impl Default for PackedMeta { + fn default() -> Self { + // The default implementation of a `PackedMeta` is one where only the `UNUSED_FLAG` is set, + // representing an unused slot in the table. 
+ Self::UNUSED + } +} + +impl Debug for PackedMeta { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PackedCode") + .field("ignored_bits", &self.ignored_bits()) + .field("code", &self.code()) + .finish() + } +} + +/// A single entry in the [`SymbolTable`]. +/// +/// `TableEntry` is based on the `Symbol` class outlined in Algorithm 4 of the FSST paper. See +/// the module documentation for a link to the paper. +#[derive(Copy, Clone, Debug)] +#[repr(C)] +pub(crate) struct TableEntry { + /// Symbol, piece of a string, 8 bytes or fewer. + pub(crate) symbol: Symbol, + + /// Bit-packed metadata for the entry. + /// + /// [`PackedMeta`] provides compact, efficient access to metadata about the `symbol`, including + /// its code and length. + pub(crate) packed_meta: PackedMeta, +} + +assert_sizeof!(TableEntry => 16); + +/// Lossy Perfect Hash Table implementation for compression. +/// +/// This implements the "Lossy Perfect Hash Table" described in Section 5 of the paper. +/// +/// It is so-called because the `insert` operation for a symbol may fail, if another symbol is +/// already occupying the slot. +/// +/// If insertions are made from highest-gain to lowest and from longest-symbol to shortest, then +/// we can say that any failed insert is not a big loss, because its slot is being held by a higher-gain +/// symbol. Note that because other code in this crate calls `insert` in the pop-order of a max heap, +/// this holds. +#[derive(Clone, Debug)] +pub(crate) struct LossyPHT { + /// Hash table slots. Used for strings that are 3 bytes or more. 
+ slots: Vec, +} + +impl LossyPHT { + /// Construct a new empty lossy perfect hash table + pub(crate) fn new() -> Self { + let mut slots = Vec::with_capacity(HASH_TABLE_SIZE); + // Initialize all slots to empty entries + for _ in 0..HASH_TABLE_SIZE { + slots.push(TableEntry { + symbol: Symbol::ZERO, + packed_meta: PackedMeta::UNUSED, + }); + } + + Self { slots } + } + + /// Try and insert the (symbol, code) pair into the table. + /// + /// If there is a collision, we keep the current thing and reject the write. + /// + /// # Returns + /// + /// True if the symbol was inserted into the table, false if it was rejected due to collision. + pub(crate) fn insert(&mut self, symbol: Symbol, code: u8) -> bool { + let prefix_3bytes = symbol.as_u64() & 0xFF_FF_FF; + let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + let mut entry = self.slots[slot]; + + if !entry.packed_meta.is_unused() { + return false; + } else { + entry.symbol = symbol; + entry.packed_meta = PackedMeta::new(symbol.len() as u16, code); + return true; + } + } + + pub(crate) fn lookup(&self, word: u64) -> TableEntry { + let prefix_3bytes = word & 0xFF_FF_FF; + let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + + self.slots[slot] + } + + /// Hash a value to find the bucket it belongs in. + /// + /// The particular hash function comes from the code listing of Algorithm 4 of the FSST paper. 
+ #[inline] + fn hash(&self, value: u64) -> u64 { + (value * 2971215073) ^ (value >> 15) + } +} + +impl Default for LossyPHT { + fn default() -> Self { + Self::new() + } +} From b625318273637e6c9aa2bd8e376d4273c9d14462 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 08:49:30 -0400 Subject: [PATCH 03/20] save --- src/lib.rs | 110 ++++++++++++++++++++++++++++++++++++++--------- src/lossy_pht.rs | 35 +++++++++------ 2 files changed, 111 insertions(+), 34 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4e83ebe..1d77248 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,7 +21,10 @@ macro_rules! assert_sizeof { }; } -use std::fmt::{Debug, Formatter}; +use std::{ + fmt::{Debug, Formatter}, + u64, +}; pub use builder::*; use lossy_pht::LossyPHT; @@ -266,8 +269,8 @@ impl Default for SymbolTable { for first in 0..256 { for second in 0..256 { let index = (first << 8) | second; - table.codes_twobyte[index as usize] = - ((first << 8) | (Code::ESCAPE_CODE as usize)) as u16; + // 511 is (first << 8) | ESCAPE_CODE + table.codes_twobyte[index as usize] = 511u16; } } @@ -290,11 +293,30 @@ impl SymbolTable { let symbol_len = symbol.len(); if symbol_len == 2 { // Speculatively insert the symbol into the twobyte cache + println!( + "inserting 2-byte symbol {:?}", + symbol + .as_slice() + .iter() + .map(|c| *c as char) + .collect::>() + ); self.codes_twobyte[symbol.first_two_bytes() as usize] = self.n_symbols as u16; } else if symbol_len >= 3 { + println!( + "inserting long symbol {:?}", + symbol + .as_slice() + .iter() + .map(|c| *c as char) + .collect::>() + ); // Attempt to insert larger symbols into the 3-byte cache if !self.lossy_pht.insert(symbol, self.n_symbols) { + println!("\t❌ insert failed"); return false; + } else { + println!("\t✅ insert successful"); } } @@ -306,7 +328,7 @@ impl SymbolTable { } /// Using the symbol table, runs a single cycle of compression from the front of `in_ptr`, writing - /// the output into `out_ptr`. 
+ /// the output into `out_ptr`. Attempts to process an entire 64-bit word of prefix from `in_ptr`. /// /// # Returns /// @@ -321,14 +343,12 @@ impl SymbolTable { /// # Safety /// /// `in_ptr` and `out_ptr` must never be NULL or otherwise point to invalid memory. - pub(crate) unsafe fn compress_single( - &self, - in_ptr: *const u8, - out_ptr: *mut u8, - ) -> (u8, usize, usize) { - // Load a full 8-byte word of data from in_ptr. - // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. - let word: u64 = unsafe { (in_ptr as *const u64).read_unaligned() }; + #[inline(never)] + pub(crate) unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) { + println!( + "compress called: next word = {:?}", + word.to_le_bytes().map(|c| c as char) + ); // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and // if it isn't, it will be overwritten anyway. @@ -339,17 +359,37 @@ impl SymbolTable { // Access the hash table, and see if we have a match. let entry = self.lossy_pht.lookup(word); + println!( + "\tentry.symbol={:?} packed_meta: {:?}", + entry + .symbol + .as_slice() + .iter() + .map(|c| *c as char) + .collect::>(), + entry.packed_meta + ); // Now, downshift the `word` and the `entry` to see if they align. - let word_prefix = - word >> (0xFF_FF_FF_FF_FF_FF_FF_FFu64 >> entry.packed_meta.ignored_bits()); + let ignored_bits = entry.packed_meta.ignored_bits(); + let mask = if ignored_bits == 64 { + 0 + } else { + u64::MAX >> ignored_bits + }; + let word_prefix = word & mask; // This ternary-like branch corresponds to the "conditional move" line from the paper's Algorithm 4: // if the shifted word and symbol match, we use it. Else, we use the precomputed twobyte code for this // byte sequence. 
let code = if entry.symbol.as_u64() == word_prefix && !entry.packed_meta.is_unused() { + println!("\t\tusing packed_meta.code"); entry.packed_meta.code() as u16 } else { + println!( + "\t\tusing twobyte code: {:b}", + self.codes_twobyte[(word as u16) as usize] + ); self.codes_twobyte[(word as u16) as usize] }; // Write the first byte of `code` to the output position. @@ -358,11 +398,12 @@ impl SymbolTable { out_ptr.write_unaligned(code as u8); }; - // Seek the output pointer forward. - let advance_in = (64 - entry.packed_meta.ignored_bits()) >> 3; - let advance_out = 2 - ((code >> 8) & 1) as usize; + // Seek the output pointer forward by howeer large the code was. + let advance_in = entry.symbol.len(); + let advance_out = 1 + ((code >> 8) & 1) as usize; - (code as u8, advance_in, advance_out) + println!("\tresult: code={code:b} advance_in={advance_in} advance_out={advance_out}"); + (advance_in, advance_out) } /// Use the symbol table to compress the plaintext into a sequence of codes and escapes. @@ -374,18 +415,47 @@ impl SymbolTable { // SAFETY: `end` will point just after the end of the `plaintext` slice. let in_end = unsafe { in_ptr.byte_add(plaintext.len()) }; + let in_end_sub8 = unsafe { in_end.byte_sub(8) }; // SAFETY: `end` will point just after the end of the `values` allocation. let out_end = unsafe { out_ptr.byte_add(values.capacity()) }; - while in_ptr < in_end && out_ptr < out_end { + while in_ptr < in_end_sub8 && out_ptr < out_end { + println!("FIRST LOOP"); // SAFETY: pointer ranges are checked in the loop condition unsafe { - let (_, advance_in, advance_out) = self.compress_single(in_ptr, out_ptr); + // Load a full 8-byte word of data from in_ptr. + // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. 
+ let word: u64 = (in_ptr as *const u64).read_unaligned(); + let (advance_in, advance_out) = self.compress_word(word, out_ptr); in_ptr = in_ptr.byte_add(advance_in); out_ptr = out_ptr.byte_add(advance_out); }; } + let remaining_bytes = unsafe { in_end.byte_offset_from(in_ptr) }; + // Shift the mask down by the number of bytes each time. + // The shift amount will start at 0, and increase each time. + + // Shift off the remaining bytes, if not none + let mut mask: u64 = if remaining_bytes == 8 { + u64::MAX + } else { + u64::MAX >> (64 - 8 * remaining_bytes) + }; + + while in_ptr < in_end && out_ptr < out_end { + println!("SECOND LOOP"); + unsafe { + // Load a full 8-byte word of data from in_ptr. + // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. + let word: u64 = (in_ptr as *const u64).read_unaligned(); + let (advance_in, advance_out) = self.compress_word(word & mask, out_ptr); + in_ptr = in_ptr.byte_add(advance_in); + out_ptr = out_ptr.byte_add(advance_out); + mask = mask >> (8 * advance_in); + } + } + // in_ptr should have exceeded in_end assert!(in_ptr >= in_end, "exhausted output buffer before exhausting input, there is a bug in SymbolTable::compress()"); diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs index 0a25562..5ffd209 100644 --- a/src/lossy_pht.rs +++ b/src/lossy_pht.rs @@ -2,7 +2,6 @@ use std::fmt::Debug; use std::fmt::Formatter; use std::u16; -use crate::Code; use crate::Symbol; /// Size of the perfect hash table. @@ -16,8 +15,7 @@ pub const HASH_TABLE_SIZE: usize = 1 << 11; /// /// Bitpacked layout: /// -/// bits 10-15: ignored bits in the symbol. Equivalent to 64 - symbol.len()*8 -/// bit 9: not used +/// bits 9-15: ignored bits in the symbol. 
Equivalent to 64 - symbol.len()*8 /// bit 8: the "unused" flag /// bits 0-7: code value (0-254) #[derive(Clone, Copy)] @@ -32,9 +30,10 @@ impl PackedMeta { /// All bits are set, corresponding to /// /// 6 bits set for `ignored bits` + /// 1 unused bit /// 1 bit to indicate the `unused` flag /// 8 bits of `code` data - pub const UNUSED: Self = Self(0xFFFF); + pub const UNUSED: Self = Self(0b10000001_11111111); /// The 8th bit toggles if the slot is unused or not. const UNUSED_FLAG: u16 = 1 << 8; @@ -48,20 +47,16 @@ impl PackedMeta { let ignored_bits = 64 - 8 * len; - let packed = (ignored_bits << 10) | (code as u16); + let packed = (ignored_bits << 9) | (code as u16); Self(packed) } /// Import a `PackedSymbolMeta` from a raw `u16`. pub fn from_u16(value: u16) -> Self { assert!( - (value >> 12) <= 64, + (value >> 9) <= 64, "cannot construct PackedCode with len > 8" ); - assert!( - (value & 0b111_111_111) <= Code::CODE_MAX, - "cannot construct PackedCode with code > CODE_MAX" - ); Self(value) } @@ -70,11 +65,11 @@ impl PackedMeta { /// /// Always <= 64 #[inline] - pub(crate) fn ignored_bits(&self) -> usize { - ((self.0 >> 12) & 0b1111) as usize + pub(crate) fn ignored_bits(&self) -> u16 { + (self.0 >> 9) as u16 } - /// Get the raw code value. + /// Get the code value. 
#[inline] pub(crate) fn code(&self) -> u8 { self.0 as u8 @@ -165,7 +160,8 @@ impl LossyPHT { pub(crate) fn insert(&mut self, symbol: Symbol, code: u8) -> bool { let prefix_3bytes = symbol.as_u64() & 0xFF_FF_FF; let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); - let mut entry = self.slots[slot]; + println!("\t\tinserting to slot {slot}"); + let entry = &mut self.slots[slot]; if !entry.packed_meta.is_unused() { return false; @@ -197,3 +193,14 @@ impl Default for LossyPHT { Self::new() } } + +#[cfg(test)] +mod test { + use crate::lossy_pht::PackedMeta; + + #[test] + fn test_packedmeta() { + assert!(PackedMeta::UNUSED.is_unused()); + assert_eq!(PackedMeta::UNUSED.ignored_bits(), 64); + } +} From 8d11823b50ba7130a5e3782c06e661a009cb4038 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 14:16:21 -0400 Subject: [PATCH 04/20] save --- .cargo/config.toml | 22 +++++ .gitignore | 3 + Cargo.lock | 13 ++- Cargo.toml | 17 +++- benches/compress.rs | 6 +- examples/round_trip.rs | 19 ++++ rust-toolchain.toml | 4 +- src/builder.rs | 3 +- src/lib.rs | 203 ++++++++++++++++++++++++++++------------- src/lossy_pht.rs | 1 - 10 files changed, 222 insertions(+), 69 deletions(-) create mode 100644 .cargo/config.toml create mode 100644 examples/round_trip.rs diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000..a8ffa27 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,22 @@ +[target.aarch64-apple-darwin] +rustflags = [ + "-C", + "link-arg=-undefined", + "-C", + "link-arg=dynamic_lookup", + "-Z", + "verbose-internals", + "-Z", + "track-diagnostics", +] +[target.x86_64-apple-darwin] +rustflags = [ + "-C", + "link-arg=-undefined", + "-C", + "link-arg=dynamic_lookup", + "-Z", + "verbose-internals", + "-Z", + "track-diagnostics", +] diff --git a/.gitignore b/.gitignore index 8b196e9..42ffee0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ # already existing elements were commented out #/target + +# compiler debug 
reports +rustc-ice* diff --git a/Cargo.lock b/Cargo.lock index 48d9198..b5e4226 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -43,9 +43,12 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9e8aabfac534be767c909e0690571677d49f41bd8465ae876fe043d52ba5292" +checksum = "5fb8dd288a69fc53a1996d7ecfbf4a20d59065bff137ce7e56bbd620de191189" +dependencies = [ + "shlex", +] [[package]] name = "cfg-if" @@ -438,6 +441,12 @@ dependencies = [ "serde", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "syn" version = "2.0.74" diff --git a/Cargo.toml b/Cargo.toml index d54d01e..e3161e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,22 @@ use_debug = { level = "deny" } criterion = "0.5" lz4 = "1" +[[example]] +name = "round_trip" +bench = false +test = false + [[bench]] name = "compress" harness = false -bench = true + +# [profile.dev] +# lto = "off" + +# [profile.release] +# opt-level = 3 +# lto = "off" + +# [profile.bench] +# opt-level = 3 +# lto = "thin" diff --git a/benches/compress.rs b/benches/compress.rs index 829b7e6..e42c514 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -4,6 +4,7 @@ //! //! Also contains LZ4 baseline. 
#![allow(missing_docs)] +use core::str; use std::io::{Cursor, Read, Write}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; @@ -36,7 +37,10 @@ fn bench_fsst(c: &mut Criterion) { compressed.len() ); - assert_eq!(table.decompress(&compressed), TEST.as_bytes()); + let decompressed = table.decompress(&compressed); + let decompressed = str::from_utf8(&decompressed).unwrap(); + println!("DECODED: {}", decompressed); + assert_eq!(decompressed, TEST); group.bench_function("compress-single", |b| { b.iter(|| black_box(table.compress(black_box(plaintext)))); diff --git a/examples/round_trip.rs b/examples/round_trip.rs new file mode 100644 index 0000000..1924065 --- /dev/null +++ b/examples/round_trip.rs @@ -0,0 +1,19 @@ +use core::str; + +/// Simple example of compression. + +fn main() { + // Train on a sample. + let sample = "the quick brown fox jumped over the lazy dog"; + let trained = fsst_rs::train(sample.as_bytes()); + let compressed = trained.compress(sample.as_bytes()); + println!("compressed: {} => {}", sample.len(), compressed.len()); + // decompress now + let decode = trained.decompress(&compressed); + let output = str::from_utf8(&decode).unwrap(); + println!( + "decoded to the original: len={} text='{}'", + decode.len(), + output + ); +} diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 544af13..04a6423 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "nightly-2024-06-19" +# channel = "stable" +channel = "nightly-2024-08-14" components = ["rust-src", "rustfmt", "clippy"] profile = "minimal" - diff --git a/src/builder.rs b/src/builder.rs index 8dece99..081dde0 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -209,6 +209,7 @@ mod test { // Use the table to compress a string, see the values let compressed = table.compress(text.as_bytes()); + assert_eq!(compressed, vec![0u8, 1u8, 2u8]); // Ensure that the compressed string has no escape bytes assert!(compressed.iter().all(|b| *b != 
Code::ESCAPE_CODE)); @@ -231,6 +232,6 @@ mod test { Code::ESCAPE_CODE, b'3', ] - ) + ); } } diff --git a/src/lib.rs b/src/lib.rs index 1d77248..074baaa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#![allow(unused)] #![doc = include_str!("../README.md")] /// Throw a compiler error if a type isn't guaranteed to have a specific size in bytes. @@ -245,7 +246,7 @@ pub struct SymbolTable { // Index structures used to speedup building the symbol table and compression // /// Inverted index mapping 2-byte symbols to codes - codes_twobyte: [u16; 65_536], + codes_twobyte: Vec, /// Lossy perfect hash table for looking up codes to symbols that are 3 bytes or more lossy_pht: LossyPHT, @@ -256,7 +257,7 @@ impl Default for SymbolTable { let mut table = Self { symbols: [Symbol::ZERO; 511], n_symbols: 0, - codes_twobyte: [0; 65_536], + codes_twobyte: vec![0; 65_536], lossy_pht: LossyPHT::new(), }; @@ -270,7 +271,7 @@ impl Default for SymbolTable { for second in 0..256 { let index = (first << 8) | second; // 511 is (first << 8) | ESCAPE_CODE - table.codes_twobyte[index as usize] = 511u16; + table.codes_twobyte[index as usize] = (first << 8) | (1 << 8) | 0xFF; } } @@ -293,30 +294,11 @@ impl SymbolTable { let symbol_len = symbol.len(); if symbol_len == 2 { // Speculatively insert the symbol into the twobyte cache - println!( - "inserting 2-byte symbol {:?}", - symbol - .as_slice() - .iter() - .map(|c| *c as char) - .collect::>() - ); self.codes_twobyte[symbol.first_two_bytes() as usize] = self.n_symbols as u16; } else if symbol_len >= 3 { - println!( - "inserting long symbol {:?}", - symbol - .as_slice() - .iter() - .map(|c| *c as char) - .collect::>() - ); // Attempt to insert larger symbols into the 3-byte cache if !self.lossy_pht.insert(symbol, self.n_symbols) { - println!("\t❌ insert failed"); return false; - } else { - println!("\t✅ insert successful"); } } @@ 
-344,31 +326,26 @@ impl SymbolTable { /// /// `in_ptr` and `out_ptr` must never be NULL or otherwise point to invalid memory. #[inline(never)] - pub(crate) unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) { - println!( - "compress called: next word = {:?}", - word.to_le_bytes().map(|c| c as char) - ); - + pub(crate) unsafe fn compress_word( + &self, + word: u64, + out_ptr: *mut u8, + out_start: *mut u8, + ) -> (usize, usize) { // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and // if it isn't, it will be overwritten anyway. // // SAFETY: caller ensures out_ptr is not null let first_byte = word as u8; + println!( + "WRITING out[{}] = {}", + out_ptr.byte_add(1).offset_from(out_start) as usize, + first_byte + ); unsafe { out_ptr.byte_add(1).write_unaligned(first_byte) }; - // Access the hash table, and see if we have a match. + // Probe the hash table let entry = self.lossy_pht.lookup(word); - println!( - "\tentry.symbol={:?} packed_meta: {:?}", - entry - .symbol - .as_slice() - .iter() - .map(|c| *c as char) - .collect::>(), - entry.packed_meta - ); // Now, downshift the `word` and the `entry` to see if they align. let ignored_bits = entry.packed_meta.ignored_bits(); @@ -383,26 +360,60 @@ impl SymbolTable { // if the shifted word and symbol match, we use it. Else, we use the precomputed twobyte code for this // byte sequence. 
let code = if entry.symbol.as_u64() == word_prefix && !entry.packed_meta.is_unused() { - println!("\t\tusing packed_meta.code"); - entry.packed_meta.code() as u16 - } else { + // Show symbol as str println!( - "\t\tusing twobyte code: {:b}", - self.codes_twobyte[(word as u16) as usize] + " HIT emitting code for symbol {:?}", + entry + .symbol + .as_slice() + .iter() + .map(|c| *c as char) + .collect::>() ); + entry.packed_meta.code() as u16 + } else { + let code = self.codes_twobyte[(word as u16) as usize]; + println!(" MISS - code={:X?}", code); + if code >= 256 { + // It's an escape of the current word first byte. + println!( + " MISS - emitting escape for char '{}'", + ((code >> 8) as u8 as char) + ); + } else { + println!( + " MISS - emitting code {} for symbol {:?}", + code, + self.symbols[(256 + code) as usize] + .as_slice() + .iter() + .map(|c| *c as char) + .collect::>(), + ); + } self.codes_twobyte[(word as u16) as usize] }; // Write the first byte of `code` to the output position. // The code will either by a real code with a single byte of padding, OR a two-byte code sequence. + println!( + "WRITING out[{}] = {}", + out_ptr.offset_from(out_start) as usize, + (code as u8) + ); unsafe { out_ptr.write_unaligned(code as u8); }; - // Seek the output pointer forward by howeer large the code was. - let advance_in = entry.symbol.len(); + // Seek the pointers forward. 
+ // NOTE: if the symbol is not a hit, + let advance_in = if entry.symbol.as_u64() == word_prefix && !entry.packed_meta.is_unused() { + entry.symbol.len() + } else { + 1 + }; let advance_out = 1 + ((code >> 8) & 1) as usize; + println!("ADVANCE in={} out={} \n", advance_in, advance_out); - println!("\tresult: code={code:b} advance_in={advance_in} advance_out={advance_out}"); (advance_in, advance_out) } @@ -412,6 +423,7 @@ impl SymbolTable { let mut in_ptr = plaintext.as_ptr(); let mut out_ptr = values.as_mut_ptr(); + let out_start = values.as_mut_ptr(); // SAFETY: `end` will point just after the end of the `plaintext` slice. let in_end = unsafe { in_ptr.byte_add(plaintext.len()) }; @@ -420,50 +432,60 @@ impl SymbolTable { let out_end = unsafe { out_ptr.byte_add(values.capacity()) }; while in_ptr < in_end_sub8 && out_ptr < out_end { - println!("FIRST LOOP"); // SAFETY: pointer ranges are checked in the loop condition unsafe { // Load a full 8-byte word of data from in_ptr. // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. let word: u64 = (in_ptr as *const u64).read_unaligned(); - let (advance_in, advance_out) = self.compress_word(word, out_ptr); + let (advance_in, advance_out) = self.compress_word(word, out_ptr, out_start); in_ptr = in_ptr.byte_add(advance_in); out_ptr = out_ptr.byte_add(advance_out); }; } let remaining_bytes = unsafe { in_end.byte_offset_from(in_ptr) }; - // Shift the mask down by the number of bytes each time. - // The shift amount will start at 0, and increase each time. 
+ assert!( + remaining_bytes.is_positive(), + "in_ptr exceeded in_end, should not be possible" + ); - // Shift off the remaining bytes, if not none - let mut mask: u64 = if remaining_bytes == 8 { - u64::MAX - } else { - u64::MAX >> (64 - 8 * remaining_bytes) - }; + // Shift off the remaining bytes + let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() }; + last_word = mask_prefix(last_word, remaining_bytes as usize); while in_ptr < in_end && out_ptr < out_end { - println!("SECOND LOOP"); unsafe { // Load a full 8-byte word of data from in_ptr. // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. - let word: u64 = (in_ptr as *const u64).read_unaligned(); - let (advance_in, advance_out) = self.compress_word(word & mask, out_ptr); + let (advance_in, advance_out) = self.compress_word(last_word, out_ptr, out_start); in_ptr = in_ptr.byte_add(advance_in); out_ptr = out_ptr.byte_add(advance_out); - mask = mask >> (8 * advance_in); + + last_word = advance_8byte_word(last_word, advance_in); } } // in_ptr should have exceeded in_end assert!(in_ptr >= in_end, "exhausted output buffer before exhausting input, there is a bug in SymbolTable::compress()"); + // Count the number of bytes written + // SAFETY: assertion + unsafe { + let bytes_written = out_ptr.offset_from(values.as_ptr()); + assert!( + bytes_written.is_positive(), + "out_ptr ended before it started, not possible" + ); + + values.set_len(bytes_written as usize); + } + values } /// Decompress a byte slice that was previously returned by [compression][Self::compress]. pub fn decompress(&self, compressed: &[u8]) -> Vec { + print_compressed(compressed); let mut decoded: Vec = Vec::with_capacity(size_of::() * compressed.len()); let ptr = decoded.as_mut_ptr(); @@ -506,3 +528,62 @@ impl SymbolTable { decoded } } + +/// Mask the word, keeping only the `prefix_bytes` front. 
+fn mask_prefix(word: u64, prefix_bytes: usize) -> u64 { + let mask = if prefix_bytes == 0 { + 0 + } else { + u64::MAX >> 8 * (8 - prefix_bytes) + }; + + word & mask +} + +fn advance_8byte_word(word: u64, bytes: usize) -> u64 { + // shift the word off the right-end, because little endian means the first + // char is stored in the LSB. + // + // Note that even though this looks like it branches, Rust compiles this to a + // conditional move instruction. See `` + if bytes == 8 { + 0 + } else { + word >> 8 * bytes + } +} + +pub fn advance_8byte_word_bits(word: u64, bits: usize) -> u64 { + // shift the word off the right-end, because little endian means the first + // char is stored in the LSB. + // + // Note that even though this looks like it branches, Rust compiles this to a + // conditional move instruction. See `` + if bits == 64 { + 0 + } else { + word >> bits + } +} + +// fn print_word(addr: *const T) -> String { +// let word = unsafe { (addr as *const u64).read_unaligned() }; +// format!("{:?}", word.to_le_bytes().map(|c| c as char)) +// } + +fn print_compressed(block: &[u8]) { + let mut repr = Vec::new(); + let mut idx = 0; + while idx < block.len() { + let byte = block[idx]; + if byte == Code::ESCAPE_CODE { + repr.push("ESCAPE".to_string()); + idx += 1; + repr.push(format!("'{}'", block[idx] as char)); + } else { + repr.push(format!("{}", block[idx])); + } + idx += 1; + } + println!("compressed: {:?}", repr); +} diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs index 5ffd209..97d1f0d 100644 --- a/src/lossy_pht.rs +++ b/src/lossy_pht.rs @@ -160,7 +160,6 @@ impl LossyPHT { pub(crate) fn insert(&mut self, symbol: Symbol, code: u8) -> bool { let prefix_3bytes = symbol.as_u64() & 0xFF_FF_FF; let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); - println!("\t\tinserting to slot {slot}"); let entry = &mut self.slots[slot]; if !entry.packed_meta.is_unused() { From bb648e8792e5f5d037be9aae728dd8cabb260303 Mon Sep 17 00:00:00 2001 From: Andrew Duffy 
Date: Wed, 14 Aug 2024 16:01:50 -0400 Subject: [PATCH 05/20] save --- benches/compress.rs | 7 +- src/builder.rs | 48 ++++--- src/find_longest/mod.rs | 4 +- src/find_longest/naive.rs | 15 +- src/lib.rs | 279 +++++++++++++++++++++++--------------- src/lossy_pht.rs | 28 +++- 6 files changed, 223 insertions(+), 158 deletions(-) diff --git a/benches/compress.rs b/benches/compress.rs index e42c514..9581c9c 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -11,7 +11,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; use lz4::liblz4::BlockChecksum; use lz4::{BlockSize, ContentChecksum}; -use fsst_rs::{train, Code}; +use fsst_rs::{train, ESCAPE_CODE}; const CORPUS: &str = include_str!("dracula.txt"); const TEST: &str = "I found my smattering of German very useful here"; @@ -27,10 +27,7 @@ fn bench_fsst(c: &mut Criterion) { let plaintext = TEST.as_bytes(); let compressed = table.compress(plaintext); - let escape_count = compressed - .iter() - .filter(|b| **b == Code::ESCAPE_CODE) - .count(); + let escape_count = compressed.iter().filter(|b| **b == ESCAPE_CODE).count(); let ratio = (plaintext.len() as f64) / (compressed.len() as f64); println!( "Escapes = {escape_count}/{}, compression_ratio = {ratio}", diff --git a/src/builder.rs b/src/builder.rs index 081dde0..3e6af87 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -22,7 +22,7 @@ use std::cmp::Ordering; use std::collections::BinaryHeap; use crate::find_longest::FindLongestSymbol; -use crate::{Code, Symbol, SymbolTable}; +use crate::{CodeMeta, Symbol, SymbolTable, MAX_CODE}; #[derive(Debug, Clone)] struct Counter { @@ -36,29 +36,29 @@ struct Counter { impl Counter { fn new() -> Self { Self { - counts1: vec![0; Code::CODE_MAX as usize], - counts2: vec![vec![0; Code::CODE_MAX as usize]; Code::CODE_MAX as usize], + counts1: vec![0; MAX_CODE as usize], + counts2: vec![vec![0; MAX_CODE as usize]; MAX_CODE as usize], } } #[inline] - fn record_count1(&mut self, code1: Code) { - 
self.counts1[code1.0 as usize] += 1; + fn record_count1(&mut self, code1: u16) { + self.counts1[code1 as usize] += 1; } #[inline] - fn record_count2(&mut self, code1: Code, code2: Code) { - self.counts2[code1.0 as usize][code2.0 as usize] += 1; + fn record_count2(&mut self, code1: u16, code2: u16) { + self.counts2[code1 as usize][code2 as usize] += 1; } #[inline] - fn count1(&self, code: Code) -> usize { - self.counts1[code.0 as usize] + fn count1(&self, code: u16) -> usize { + self.counts1[code as usize] } #[inline] - fn count2(&self, code1: Code, code2: Code) -> usize { - self.counts2[code1.0 as usize][code2.0 as usize] + fn count2(&self, code1: u16, code2: u16) -> usize { + self.counts2[code1 as usize][code2 as usize] } } @@ -96,13 +96,13 @@ impl SymbolTable { let len = sample.len(); let mut prev_code = self.find_longest_symbol(sample); counter.record_count1(prev_code); - let mut pos = self.symbols[prev_code.0 as usize].len(); + let mut pos = self.symbols[prev_code as usize].len(); while pos < len { let code = self.find_longest_symbol(&sample[pos..len]); counter.record_count1(code); counter.record_count2(prev_code, code); - pos += self.symbols[code.0 as usize].len(); + pos += self.symbols[code as usize].len(); prev_code = code; } @@ -115,8 +115,7 @@ impl SymbolTable { let mut res = SymbolTable::default(); let mut pqueue = BinaryHeap::new(); for code1 in 0..511 { - let code1 = Code::from_u16(code1); - let symbol1 = self.symbols[code1.0 as usize]; + let symbol1 = self.symbols[code1 as usize]; let gain = counters.count1(code1) * symbol1.len(); pqueue.push(Candidate { symbol: symbol1, @@ -124,8 +123,7 @@ impl SymbolTable { }); for code2 in 0..511 { - let code2 = Code::from_u16(code2); - let symbol2 = &self.symbols[code2.0 as usize]; + let symbol2 = &self.symbols[code2 as usize]; // If either symbol is zero-length, or if merging would yield a symbol of // length greater than 8, skip. 
if symbol1.len() + symbol2.len() >= 8 || symbol1.is_empty() || symbol2.is_empty() { @@ -199,7 +197,7 @@ impl Ord for Candidate { #[cfg(test)] mod test { - use crate::{train, Code}; + use crate::{train, ESCAPE_CODE}; #[test] fn test_builder() { @@ -212,24 +210,24 @@ mod test { assert_eq!(compressed, vec![0u8, 1u8, 2u8]); // Ensure that the compressed string has no escape bytes - assert!(compressed.iter().all(|b| *b != Code::ESCAPE_CODE)); + assert!(compressed.iter().all(|b| *b != ESCAPE_CODE)); // Ensure that we can compress a string with no values seen at training time. let compressed = table.compress("xyz123".as_bytes()); assert_eq!( compressed, vec![ - Code::ESCAPE_CODE, + ESCAPE_CODE, b'x', - Code::ESCAPE_CODE, + ESCAPE_CODE, b'y', - Code::ESCAPE_CODE, + ESCAPE_CODE, b'z', - Code::ESCAPE_CODE, + ESCAPE_CODE, b'1', - Code::ESCAPE_CODE, + ESCAPE_CODE, b'2', - Code::ESCAPE_CODE, + ESCAPE_CODE, b'3', ] ); diff --git a/src/find_longest/mod.rs b/src/find_longest/mod.rs index af9d80b..074205d 100644 --- a/src/find_longest/mod.rs +++ b/src/find_longest/mod.rs @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::Code; +use crate::CodeMeta; mod naive; pub trait FindLongestSymbol { - fn find_longest_symbol(&self, text: &[u8]) -> Code; + fn find_longest_symbol(&self, text: &[u8]) -> u16; } diff --git a/src/find_longest/naive.rs b/src/find_longest/naive.rs index 1ca9788..8819fb8 100644 --- a/src/find_longest/naive.rs +++ b/src/find_longest/naive.rs @@ -13,23 +13,26 @@ // limitations under the License. use crate::find_longest::FindLongestSymbol; -use crate::{Code, SymbolTable}; +use crate::{CodeMeta, SymbolTable}; -/// Find the longest substring. +// Find the code that maps to a symbol with longest-match to a piece of text. +// +// This is the naive algorithm that just scans the whole table and is very slow. 
impl FindLongestSymbol for SymbolTable { // NOTE(aduffy): if you don't disable inlining, this function won't show up in profiles. #[inline(never)] - fn find_longest_symbol(&self, text: &[u8]) -> Code { + fn find_longest_symbol(&self, text: &[u8]) -> u16 { debug_assert!(!text.is_empty(), "text must not be empty"); // Find the code that best maps to the provided text table here. - let mut best_code = Code::new_escaped(text[0]); + // Start with the code corresponding to the escape of the first character in the text + let mut best_code = text[0] as u16; let mut best_overlap = 1; - for code in 0..511 { + for code in 256..511 { let symbol = &self.symbols[code as usize]; if symbol.is_prefix(text) && symbol.len() > best_overlap { - best_code = Code::from_u16(code); + best_code = code; best_overlap = symbol.len(); } } diff --git a/src/lib.rs b/src/lib.rs index 074baaa..848d0e1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -150,44 +150,53 @@ impl Debug for Symbol { } } -/// Codes used to map symbols to bytes. +/// Code and associated metadata fro a symbol. /// -/// Logically, codes can range from 0-255 inclusive. Physically, we represent them as a 9-bit -/// value packed into a `u16`. +/// Logically, codes can range from 0-255 inclusive. This type holds both the 8-bit code as well as +/// other metadata bit-packed into a `u16`. /// -/// Physically in-memory, `Code(0)` through `Code(255)` corresponds to escape sequences of raw bytes -/// 0 through 255. `Code(256)` through `Code(511)` represent the actual codes -255. +/// The bottom 8 bits contain EITHER a code for a symbol stored in the table, OR a raw byte. +/// +/// The interpretation depends on the 9th bit: when toggled off, the value stores a raw byte, and when +/// toggled on, it stores a code. Thus if you examine the bottom 9 bits of the `u16`, you have an extended +/// code range, where the values 0-255 are raw bytes, and the values 256-510 represent codes 0-254. 511 is +/// a placeholder for the invalid code here. 
+/// +/// Bits 12-15 store the length of the symbol (values ranging from 0-8). #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub struct Code(u16); +pub struct CodeMeta(u16); -impl Code { - /// Maximum value for the in-memory `Code` representation. - /// - /// When truncated to u8 this is code 255, which is equivalent to [`Self::ESCAPE_CODE`]. - pub const CODE_MAX: u16 = 511; +/// Code used to indicate bytes that are not in the symbol table. +/// +/// When compressing a string that cannot fully be expressed with the symbol table, the compressed +/// output will contain an `ESCAPE` byte followed by a raw byte. At decompression time, the presence +/// of `ESCAPE` indicates that the next byte should be appended directly to the result instead of +/// being looked up in the symbol table. +pub const ESCAPE_CODE: u8 = 255; - /// Code used to indicate bytes that are not in the symbol table. - /// - /// When compressing a string that cannot fully be expressed with the symbol table, the compressed - /// output will contain an `ESCAPE` byte followed by a raw byte. At decompression time, the presence - /// of `ESCAPE` indicates that the next byte should be appended directly to the result instead of - /// being looked up in the symbol table. - pub const ESCAPE_CODE: u8 = 255; +/// Maximum value for the extended code range. +/// +/// When truncated to u8 this is code 255, which is equivalent to [`ESCAPE_CODE`]. +pub const MAX_CODE: u16 = 511; + +impl CodeMeta { + pub const EMPTY: Self = CodeMeta(MAX_CODE); + + pub fn new(code: u8, escape: bool, len: u16) -> Self { + let value = (len << 12) | ((escape as u16) << 8) | (code as u16); + Self(value) + } /// Create a new code representing an escape byte. pub fn new_escaped(byte: u8) -> Self { - Self(byte as u16) + Self::new(byte, true, 1) } - /// Create a new code representing a symbol. 
- pub fn new_symbol(code: u8) -> Self { - assert_ne!( - code, - Code::ESCAPE_CODE, - "code {code} cannot be used for symbol, reserved for ESCAPE" - ); + /// Create a new code from a [`Symbol`]. + pub fn new_symbol(code: u8, symbol: Symbol) -> Self { + assert_ne!(code, ESCAPE_CODE, "ESCAPE_CODE cannot be used for symbol"); - Self((code as u16) + 256) + Self::new(code, false, symbol.len() as u16) } /// Create a `Code` directly from a `u16` value. @@ -195,7 +204,11 @@ impl Code { /// # Panics /// Panic if the value is ≥ the defined `CODE_MAX`. pub fn from_u16(code: u16) -> Self { - assert!(code < Self::CODE_MAX, "code value higher than CODE_MAX"); + assert!((code >> 12) <= 8, "len must be <= 8"); + assert!( + (code & 0b111_111_111) <= MAX_CODE, + "code value higher than MAX_CODE" + ); Self(code) } @@ -205,13 +218,34 @@ impl Code { pub fn is_escape(&self) -> bool { self.0 <= 255 } + + #[inline] + pub fn code(&self) -> u8 { + self.0 as u8 + } + + #[inline] + pub fn extended_code(&self) -> u16 { + self.0 & 0b111_111_111 + } + + #[inline] + pub fn len(&self) -> u16 { + self.0 >> 12 + } + + #[inline] + pub fn as_u16(&self) -> u16 { + self.0 + } } -impl Debug for Code { +impl Debug for CodeMeta { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("Code") - .field("code_byte", &(self.0 as u8)) - .field("escape", &(self.0 < 256)) + f.debug_struct("CodeMeta") + .field("code", &(self.0 as u8)) + .field("is_escape", &(self.0 < 256)) + .field("len", &(self.0 >> 12)) .finish() } } @@ -246,7 +280,7 @@ pub struct SymbolTable { // Index structures used to speedup building the symbol table and compression // /// Inverted index mapping 2-byte symbols to codes - codes_twobyte: Vec, + codes_twobyte: Vec, /// Lossy perfect hash table for looking up codes to symbols that are 3 bytes or more lossy_pht: LossyPHT, @@ -257,7 +291,7 @@ impl Default for SymbolTable { let mut table = Self { symbols: [Symbol::ZERO; 511], n_symbols: 0, - codes_twobyte: vec![0; 65_536], + 
codes_twobyte: Vec::with_capacity(65_536), lossy_pht: LossyPHT::new(), }; @@ -266,12 +300,13 @@ impl Default for SymbolTable { table.symbols[byte as usize] = Symbol::from_u8(byte); } - // Populate the "codes" for twobytes to default to the escape sequence. + // Populate the "codes" for twobytes to default to the escape sequence + // for the first byte for first in 0..256 { - for second in 0..256 { - let index = (first << 8) | second; - // 511 is (first << 8) | ESCAPE_CODE - table.codes_twobyte[index as usize] = (first << 8) | (1 << 8) | 0xFF; + for _second in 0..256 { + // let default_code = CodeMeta::new_escaped(first as u8); + // table.codes_twobyte.push(default_code); + table.codes_twobyte.push(CodeMeta::EMPTY) } } @@ -292,9 +327,15 @@ impl SymbolTable { assert!(self.n_symbols < 255, "cannot insert into full symbol table"); let symbol_len = symbol.len(); - if symbol_len == 2 { - // Speculatively insert the symbol into the twobyte cache - self.codes_twobyte[symbol.first_two_bytes() as usize] = self.n_symbols as u16; + if symbol_len <= 2 { + // Insert the 2-byte symbol into the twobyte cache + // println!( + // "FILLING twobyte[{}] = {:?}", + // self.n_symbols, + // symbol.first_two_bytes().to_le_bytes().map(|c| c as char) + // ); + self.codes_twobyte[symbol.first_two_bytes() as usize] = + CodeMeta::new_symbol(self.n_symbols, symbol); } else if symbol_len >= 3 { // Attempt to insert larger symbols into the 3-byte cache if !self.lossy_pht.insert(symbol, self.n_symbols) { @@ -337,84 +378,77 @@ impl SymbolTable { // // SAFETY: caller ensures out_ptr is not null let first_byte = word as u8; - println!( - "WRITING out[{}] = {}", - out_ptr.byte_add(1).offset_from(out_start) as usize, - first_byte - ); + // println!( + // "WRITING out[{}] = {}", + // out_ptr.byte_add(1).offset_from(out_start) as usize, + // first_byte + // ); unsafe { out_ptr.byte_add(1).write_unaligned(first_byte) }; // Probe the hash table let entry = self.lossy_pht.lookup(word); // Now, downshift the 
`word` and the `entry` to see if they align. - let ignored_bits = entry.packed_meta.ignored_bits(); - let mask = if ignored_bits == 64 { - 0 - } else { - u64::MAX >> ignored_bits - }; - let word_prefix = word & mask; - - // This ternary-like branch corresponds to the "conditional move" line from the paper's Algorithm 4: - // if the shifted word and symbol match, we use it. Else, we use the precomputed twobyte code for this - // byte sequence. - let code = if entry.symbol.as_u64() == word_prefix && !entry.packed_meta.is_unused() { - // Show symbol as str - println!( - " HIT emitting code for symbol {:?}", - entry - .symbol - .as_slice() - .iter() - .map(|c| *c as char) - .collect::>() - ); - entry.packed_meta.code() as u16 - } else { + let ignored_bits = entry.ignored_bits; + + if !compare_masked(word, entry.symbol.as_u64(), ignored_bits) || entry.is_unused() { + // lookup the appropriate code for the twobyte sequence and write it + // This will hold either 511, OR it will hold the actual code. let code = self.codes_twobyte[(word as u16) as usize]; - println!(" MISS - code={:X?}", code); - if code >= 256 { - // It's an escape of the current word first byte. - println!( - " MISS - emitting escape for char '{}'", - ((code >> 8) as u8 as char) - ); - } else { - println!( - " MISS - emitting code {} for symbol {:?}", - code, - self.symbols[(256 + code) as usize] - .as_slice() - .iter() - .map(|c| *c as char) - .collect::>(), - ); + let out = code.code(); + unsafe { + out_ptr.write(out); } - self.codes_twobyte[(word as u16) as usize] - }; - // Write the first byte of `code` to the output position. - // The code will either by a real code with a single byte of padding, OR a two-byte code sequence. - println!( - "WRITING out[{}] = {}", - out_ptr.offset_from(out_start) as usize, - (code as u8) - ); - unsafe { - out_ptr.write_unaligned(code as u8); - }; - // Seek the pointers forward. 
- // NOTE: if the symbol is not a hit, - let advance_in = if entry.symbol.as_u64() == word_prefix && !entry.packed_meta.is_unused() { - entry.symbol.len() - } else { - 1 - }; - let advance_out = 1 + ((code >> 8) & 1) as usize; - println!("ADVANCE in={} out={} \n", advance_in, advance_out); + // Advance the input by one byte and the output by 1 byte (if real code) or 2 bytes (if escape). + return ( + if out == ESCAPE_CODE { + 1 + } else { + code.len() as usize + }, + if out == ESCAPE_CODE { 2 } else { 1 }, + ); + } + + let code = entry.code; + unsafe { + out_ptr.write_unaligned(code.code()); + } - (advance_in, advance_out) + return (code.len() as usize, 1); + + // println!(" CODE = {}", code.extended_code(),); + + // // Lookup symbol + // // println!( + // // " SYMBOL = {:?}", + // // self.symbols[code.extended_code() as usize] + // // .as_slice() + // // .iter() + // // .map(|c| *c as char) + // // .collect::>(), + // // ); + + // // Write the first byte of `code` to the output position. + // // The code will either by a real code with a single byte of padding, OR a two-byte code sequence. + // println!( + // "WRITING out[{}] = {}", + // out_ptr.offset_from(out_start) as usize, + // (code.code()) + // ); + // unsafe { + // out_ptr.write_unaligned(code.code()); + // }; + + // // Seek the pointers forward. + // // + // // IN: advance by code.len() + // let advance_in = code.len() as usize; + // let advance_out = 1 + ((code.as_u16() >> 8) & 1) as usize; + // println!("ADVANCE in={} out={} \n", advance_in, advance_out); + + // (advance_in, advance_out) } /// Use the symbol table to compress the plaintext into a sequence of codes and escapes. 
@@ -452,9 +486,18 @@ impl SymbolTable { // Shift off the remaining bytes let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() }; last_word = mask_prefix(last_word, remaining_bytes as usize); + // println!( + // "OUTER COMPRESS last_word = {:?}", + // last_word.to_le_bytes().map(|c| c as char) + // ); while in_ptr < in_end && out_ptr < out_end { unsafe { + // println!( + // "=>INNER COMPRESS last_word = {:?}", + // last_word.to_le_bytes().map(|c| c as char) + // ); + // Load a full 8-byte word of data from in_ptr. // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. let (advance_in, advance_out) = self.compress_word(last_word, out_ptr, out_start); @@ -494,7 +537,7 @@ impl SymbolTable { while in_pos < compressed.len() && out_pos < (decoded.capacity() + size_of::()) { let code = compressed[in_pos]; - if code == Code::ESCAPE_CODE { + if code == ESCAPE_CODE { // Advance by one, do raw write. in_pos += 1; // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer @@ -576,7 +619,7 @@ fn print_compressed(block: &[u8]) { let mut idx = 0; while idx < block.len() { let byte = block[idx]; - if byte == Code::ESCAPE_CODE { + if byte == ESCAPE_CODE { repr.push("ESCAPE".to_string()); idx += 1; repr.push(format!("'{}'", block[idx] as char)); @@ -585,5 +628,15 @@ fn print_compressed(block: &[u8]) { } idx += 1; } - println!("compressed: {:?}", repr); + // println!("compressed: {:?}", repr); +} + +fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { + let mask = if ignored_bits == 64 { + 0 + } else { + u64::MAX >> ignored_bits + }; + + (left & mask) == right } diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs index 97d1f0d..e6097da 100644 --- a/src/lossy_pht.rs +++ b/src/lossy_pht.rs @@ -2,7 +2,9 @@ use std::fmt::Debug; use std::fmt::Formatter; use std::u16; +use crate::CodeMeta; use crate::Symbol; +use crate::MAX_CODE; /// Size of the perfect hash table. 
/// @@ -109,15 +111,25 @@ pub(crate) struct TableEntry { /// Symbol, piece of a string, 8 bytes or fewer. pub(crate) symbol: Symbol, - /// Bit-packed metadata for the entry. + /// Code and associated metadata for the symbol + pub(crate) code: CodeMeta, + + /// Number of ignored bits in `symbol`. /// - /// [`PackedMeta`] provides compact, efficient access to metadata about the `symbol`, including - /// its code and length. - pub(crate) packed_meta: PackedMeta, + /// This is equivalent to `64 - 8 * code.len()` but is pre-computed to save a few instructions in + /// the compression loop. + pub(crate) ignored_bits: u16, } assert_sizeof!(TableEntry => 16); +impl TableEntry { + pub(crate) fn is_unused(&self) -> bool { + // 511 should never come up for real, so use as the sentinel for an unused slot + self.code.extended_code() == MAX_CODE + } +} + /// Lossy Perfect Hash Table implementation for compression. /// /// This implements the "Lossy Perfect Hash Table" described in Section 5 of the paper. 
@@ -143,7 +155,8 @@ impl LossyPHT { for _ in 0..HASH_TABLE_SIZE { slots.push(TableEntry { symbol: Symbol::ZERO, - packed_meta: PackedMeta::UNUSED, + code: CodeMeta::EMPTY, + ignored_bits: 64, }); } @@ -162,11 +175,12 @@ impl LossyPHT { let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); let entry = &mut self.slots[slot]; - if !entry.packed_meta.is_unused() { + if !entry.is_unused() { return false; } else { entry.symbol = symbol; - entry.packed_meta = PackedMeta::new(symbol.len() as u16, code); + entry.code = CodeMeta::new_symbol(code, symbol); + entry.ignored_bits = (64 - 8 * symbol.len()) as u16; return true; } } From 4a45e9122da6e2a127fd52726ba09637bd64a081 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 16:25:16 -0400 Subject: [PATCH 06/20] add some conformance tests --- src/builder.rs | 3 ++ src/lib.rs | 78 ++-------------------------------- tests/correctness.rs | 50 ++++++++++++++++++++++ tests/fixtures/declaration.txt | 63 +++++++++++++++++++++++++++ 4 files changed, 120 insertions(+), 74 deletions(-) create mode 100644 tests/correctness.rs create mode 100644 tests/fixtures/declaration.txt diff --git a/src/builder.rs b/src/builder.rs index 3e6af87..90dcdaa 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -80,6 +80,9 @@ pub fn train(corpus: impl AsRef<[u8]>) -> SymbolTable { let mut table = SymbolTable::default(); // TODO(aduffy): handle truncating/sampling if corpus > requires sample size. 
let sample = corpus.as_ref(); + if sample.is_empty() { + return table; + } for _generation in 0..MAX_GENERATIONS { let counter = table.compress_count(sample); table = table.optimize(counter); diff --git a/src/lib.rs b/src/lib.rs index 848d0e1..f17e8a5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -329,11 +329,6 @@ impl SymbolTable { let symbol_len = symbol.len(); if symbol_len <= 2 { // Insert the 2-byte symbol into the twobyte cache - // println!( - // "FILLING twobyte[{}] = {:?}", - // self.n_symbols, - // symbol.first_two_bytes().to_le_bytes().map(|c| c as char) - // ); self.codes_twobyte[symbol.first_two_bytes() as usize] = CodeMeta::new_symbol(self.n_symbols, symbol); } else if symbol_len >= 3 { @@ -378,11 +373,6 @@ impl SymbolTable { // // SAFETY: caller ensures out_ptr is not null let first_byte = word as u8; - // println!( - // "WRITING out[{}] = {}", - // out_ptr.byte_add(1).offset_from(out_start) as usize, - // first_byte - // ); unsafe { out_ptr.byte_add(1).write_unaligned(first_byte) }; // Probe the hash table @@ -417,42 +407,14 @@ impl SymbolTable { } return (code.len() as usize, 1); - - // println!(" CODE = {}", code.extended_code(),); - - // // Lookup symbol - // // println!( - // // " SYMBOL = {:?}", - // // self.symbols[code.extended_code() as usize] - // // .as_slice() - // // .iter() - // // .map(|c| *c as char) - // // .collect::>(), - // // ); - - // // Write the first byte of `code` to the output position. - // // The code will either by a real code with a single byte of padding, OR a two-byte code sequence. - // println!( - // "WRITING out[{}] = {}", - // out_ptr.offset_from(out_start) as usize, - // (code.code()) - // ); - // unsafe { - // out_ptr.write_unaligned(code.code()); - // }; - - // // Seek the pointers forward. 
- // // - // // IN: advance by code.len() - // let advance_in = code.len() as usize; - // let advance_out = 1 + ((code.as_u16() >> 8) & 1) as usize; - // println!("ADVANCE in={} out={} \n", advance_in, advance_out); - - // (advance_in, advance_out) } /// Use the symbol table to compress the plaintext into a sequence of codes and escapes. pub fn compress(&self, plaintext: &[u8]) -> Vec { + if plaintext.is_empty() { + return Vec::new(); + } + let mut values: Vec = Vec::with_capacity(2 * plaintext.len()); let mut in_ptr = plaintext.as_ptr(); @@ -486,18 +448,9 @@ impl SymbolTable { // Shift off the remaining bytes let mut last_word = unsafe { (in_ptr as *const u64).read_unaligned() }; last_word = mask_prefix(last_word, remaining_bytes as usize); - // println!( - // "OUTER COMPRESS last_word = {:?}", - // last_word.to_le_bytes().map(|c| c as char) - // ); while in_ptr < in_end && out_ptr < out_end { unsafe { - // println!( - // "=>INNER COMPRESS last_word = {:?}", - // last_word.to_le_bytes().map(|c| c as char) - // ); - // Load a full 8-byte word of data from in_ptr. // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. let (advance_in, advance_out) = self.compress_word(last_word, out_ptr, out_start); @@ -528,7 +481,6 @@ impl SymbolTable { /// Decompress a byte slice that was previously returned by [compression][Self::compress]. 
pub fn decompress(&self, compressed: &[u8]) -> Vec { - print_compressed(compressed); let mut decoded: Vec = Vec::with_capacity(size_of::() * compressed.len()); let ptr = decoded.as_mut_ptr(); @@ -609,28 +561,6 @@ pub fn advance_8byte_word_bits(word: u64, bits: usize) -> u64 { } } -// fn print_word(addr: *const T) -> String { -// let word = unsafe { (addr as *const u64).read_unaligned() }; -// format!("{:?}", word.to_le_bytes().map(|c| c as char)) -// } - -fn print_compressed(block: &[u8]) { - let mut repr = Vec::new(); - let mut idx = 0; - while idx < block.len() { - let byte = block[idx]; - if byte == ESCAPE_CODE { - repr.push("ESCAPE".to_string()); - idx += 1; - repr.push(format!("'{}'", block[idx] as char)); - } else { - repr.push(format!("{}", block[idx])); - } - idx += 1; - } - // println!("compressed: {:?}", repr); -} - fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { let mask = if ignored_bits == 64 { 0 diff --git a/tests/correctness.rs b/tests/correctness.rs new file mode 100644 index 0000000..fabef5b --- /dev/null +++ b/tests/correctness.rs @@ -0,0 +1,50 @@ +static PREAMBLE: &str = r#" +When in the Course of human events, it becomes necessary for one people to dissolve +the political bands which have connected them with another, and to assume among the +powers of the earth, the separate and equal station to which the Laws of Nature and +of Nature's God entitle them, a decent respect to the opinions of mankind requires +that they should declare the causes which impel them to the separation."#; + +static DECLARATION: &str = include_str!("./fixtures/declaration.txt"); + +#[test] +fn test_basic() { + // Roundtrip the declaration + let trained = fsst_rs::train(PREAMBLE); + let compressed = trained.compress(PREAMBLE.as_bytes()); + let decompressed = trained.decompress(&compressed); + assert_eq!(decompressed, PREAMBLE.as_bytes()); +} + +#[test] +fn test_train_on_empty() { + let trained = fsst_rs::train(""); + // We can still compress with it, 
but the symbols are going to be empty. + let compressed = trained.compress("the quick brown fox jumped over the lazy dog".as_bytes()); + assert_eq!( + trained.decompress(&compressed), + "the quick brown fox jumped over the lazy dog".as_bytes() + ); +} + +#[test] +fn test_zeros() { + // make sure we don't panic if there are zeros in the training or input data + let training_data: Vec = vec![0, 1, 2, 3, 4]; + let trained = fsst_rs::train(&training_data); + let compressed = trained.compress(&[0, 4]); + assert_eq!(trained.decompress(&compressed), &[0, 4]); +} + +#[test] +fn test_large() { + // Generate 100KB of test data + let mut corpus = String::new(); + while corpus.len() < 8 * 1_024 * 1_024 { + corpus.push_str(DECLARATION); + } + + let trained = fsst_rs::train(&corpus); + let compressed = trained.compress(corpus.as_bytes()); + assert_eq!(trained.decompress(&compressed), corpus.as_bytes()); +} diff --git a/tests/fixtures/declaration.txt b/tests/fixtures/declaration.txt new file mode 100644 index 0000000..30ed22d --- /dev/null +++ b/tests/fixtures/declaration.txt @@ -0,0 +1,63 @@ +The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the powers of the earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle them, a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation. 
+ +We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.--That to secure these rights, Governments are instituted among Men, deriving their just powers from the consent of the governed, --That whenever any Form of Government becomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to institute new Government, laying its foundation on such principles and organizing its powers in such form, as to them shall seem most likely to effect their Safety and Happiness. Prudence, indeed, will dictate that Governments long established should not be changed for light and transient causes; and accordingly all experience hath shewn, that mankind are more disposed to suffer, while evils are sufferable, than to right themselves by abolishing the forms to which they are accustomed. But when a long train of abuses and usurpations, pursuing invariably the same Object evinces a design to reduce them under absolute Despotism, it is their right, it is their duty, to throw off such Government, and to provide new Guards for their future security.--Such has been the patient sufferance of these Colonies; and such is now the necessity which constrains them to alter their former Systems of Government. The history of the present King of Great Britain is a history of repeated injuries and usurpations, all having in direct object the establishment of an absolute Tyranny over these States. To prove this, let Facts be submitted to a candid world. + +He has refused his Assent to Laws, the most wholesome and necessary for the public good. + +He has forbidden his Governors to pass Laws of immediate and pressing importance, unless suspended in their operation till his Assent should be obtained; and when so suspended, he has utterly neglected to attend to them. 
+ +He has refused to pass other Laws for the accommodation of large districts of people, unless those people would relinquish the right of Representation in the Legislature, a right inestimable to them and formidable to tyrants only. + +He has called together legislative bodies at places unusual, uncomfortable, and distant from the depository of their public Records, for the sole purpose of fatiguing them into compliance with his measures. + +He has dissolved Representative Houses repeatedly, for opposing with manly firmness his invasions on the rights of the people. + +He has refused for a long time, after such dissolutions, to cause others to be elected; whereby the Legislative powers, incapable of Annihilation, have returned to the People at large for their exercise; the State remaining in the mean time exposed to all the dangers of invasion from without, and convulsions within. + +He has endeavoured to prevent the population of these States; for that purpose obstructing the Laws for Naturalization of Foreigners; refusing to pass others to encourage their migrations hither, and raising the conditions of new Appropriations of Lands. + +He has obstructed the Administration of Justice, by refusing his Assent to Laws for establishing Judiciary powers. + +He has made Judges dependent on his Will alone, for the tenure of their offices, and the amount and payment of their salaries. + +He has erected a multitude of New Offices, and sent hither swarms of Officers to harrass our people, and eat out their substance. + +He has kept among us, in times of peace, Standing Armies without the Consent of our legislatures. + +He has affected to render the Military independent of and superior to the Civil power. 
+ +He has combined with others to subject us to a jurisdiction foreign to our constitution, and unacknowledged by our laws; giving his Assent to their Acts of pretended Legislation: + +For Quartering large bodies of armed troops among us: + +For protecting them, by a mock Trial, from punishment for any Murders which they should commit on the Inhabitants of these States: + +For cutting off our Trade with all parts of the world: + +For imposing Taxes on us without our Consent: + +For depriving us in many cases, of the benefits of Trial by Jury: + +For transporting us beyond Seas to be tried for pretended offences + +For abolishing the free System of English Laws in a neighbouring Province, establishing therein an Arbitrary government, and enlarging its Boundaries so as to render it at once an example and fit instrument for introducing the same absolute rule into these Colonies: + +For taking away our Charters, abolishing our most valuable Laws, and altering fundamentally the Forms of our Governments: + +For suspending our own Legislatures, and declaring themselves invested with power to legislate for us in all cases whatsoever. + +He has abdicated Government here, by declaring us out of his Protection and waging War against us. + +He has plundered our seas, ravaged our Coasts, burnt our towns, and destroyed the lives of our people. + +He is at this time transporting large Armies of foreign Mercenaries to compleat the works of death, desolation and tyranny, already begun with circumstances of Cruelty & perfidy scarcely paralleled in the most barbarous ages, and totally unworthy the Head of a civilized nation. + +He has constrained our fellow Citizens taken Captive on the high Seas to bear Arms against their Country, to become the executioners of their friends and Brethren, or to fall themselves by their Hands. 
+ +He has excited domestic insurrections amongst us, and has endeavoured to bring on the inhabitants of our frontiers, the merciless Indian Savages, whose known rule of warfare, is an undistinguished destruction of all ages, sexes and conditions. + +In every stage of these Oppressions We have Petitioned for Redress in the most humble terms: Our repeated Petitions have been answered only by repeated injury. A Prince whose character is thus marked by every act which may define a Tyrant, is unfit to be the ruler of a free people. + +Nor have We been wanting in attentions to our Brittish brethren. We have warned them from time to time of attempts by their legislature to extend an unwarrantable jurisdiction over us. We have reminded them of the circumstances of our emigration and settlement here. We have appealed to their native justice and magnanimity, and we have conjured them by the ties of our common kindred to disavow these usurpations, which, would inevitably interrupt our connections and correspondence. They too have been deaf to the voice of justice and of consanguinity. We must, therefore, acquiesce in the necessity, which denounces our Separation, and hold them, as we hold the rest of mankind, Enemies in War, in Peace Friends. 
+ +We, therefore, the Representatives of the united States of America, in General Congress, Assembled, appealing to the Supreme Judge of the world for the rectitude of our intentions, do, in the Name, and by Authority of the good People of these Colonies, solemnly publish and declare, That these United Colonies are, and of Right ought to be Free and Independent States; that they are Absolved from all Allegiance to the British Crown, and that all political connection between them and the State of Great Britain, is and ought to be totally dissolved; and that as Free and Independent States, they have full Power to levy War, conclude Peace, contract Alliances, establish Commerce, and to do all other Acts and Things which Independent States may of right do. And for the support of this Declaration, with a firm reliance on the protection of divine Providence, we mutually pledge to each other our Lives, our Fortunes and our sacred Honor. From 04ef0edfaa3220ac3aff3c90b2d5950c4222d5a4 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 16:40:16 -0400 Subject: [PATCH 07/20] make clippy and cargofmt happy --- Cargo.toml | 16 +++++----------- examples/round_trip.rs | 4 ++-- src/lib.rs | 43 ++++++++++++++++++------------------------ src/lossy_pht.rs | 9 ++++----- tests/correctness.rs | 2 ++ 5 files changed, 31 insertions(+), 43 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e3161e8..31f9e7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ edition = "2021" [lints.rust] warnings = "deny" -# missing_docs = "deny" +missing_docs = "deny" [lints.clippy] all = { level = "deny", priority = -1 } @@ -35,13 +35,7 @@ test = false name = "compress" harness = false -# [profile.dev] -# lto = "off" - -# [profile.release] -# opt-level = 3 -# lto = "off" - -# [profile.bench] -# opt-level = 3 -# lto = "thin" +[[test]] +name = "correctness" +test = true +bench = false diff --git a/examples/round_trip.rs b/examples/round_trip.rs index 1924065..0f3fab7 100644 --- 
a/examples/round_trip.rs +++ b/examples/round_trip.rs @@ -1,6 +1,6 @@ -use core::str; +//! Simple example where we show round-tripping a string through the static symbol table. -/// Simple example of compression. +use core::str; fn main() { // Train on a sample. diff --git a/src/lib.rs b/src/lib.rs index f17e8a5..41b9d08 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,10 +22,7 @@ macro_rules! assert_sizeof { }; } -use std::{ - fmt::{Debug, Formatter}, - u64, -}; +use std::fmt::{Debug, Formatter}; pub use builder::*; use lossy_pht::LossyPHT; @@ -71,7 +68,6 @@ impl Symbol { /// /// Each symbol has the capacity to hold up to 8 bytes of data, but the symbols /// can contain fewer bytes, padded with 0x00. - #[inline(never)] pub fn len(&self) -> usize { let numeric = unsafe { self.num }; // For little-endian platforms, this counts the number of *trailing* zeros @@ -88,7 +84,7 @@ impl Symbol { } #[inline] - pub fn as_u64(&self) -> u64 { + fn as_u64(&self) -> u64 { // SAFETY: the bytes can always be viewed as a u64 unsafe { self.num } } @@ -164,7 +160,7 @@ impl Debug for Symbol { /// /// Bits 12-15 store the length of the symbol (values ranging from 0-8). #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub struct CodeMeta(u16); +struct CodeMeta(u16); /// Code used to indicate bytes that are not in the symbol table. /// @@ -179,21 +175,22 @@ pub const ESCAPE_CODE: u8 = 255; /// When truncated to u8 this is code 255, which is equivalent to [`ESCAPE_CODE`]. pub const MAX_CODE: u16 = 511; +#[allow(clippy::len_without_is_empty)] impl CodeMeta { - pub const EMPTY: Self = CodeMeta(MAX_CODE); + const EMPTY: Self = CodeMeta(MAX_CODE); - pub fn new(code: u8, escape: bool, len: u16) -> Self { + fn new(code: u8, escape: bool, len: u16) -> Self { let value = (len << 12) | ((escape as u16) << 8) | (code as u16); Self(value) } /// Create a new code representing an escape byte. 
- pub fn new_escaped(byte: u8) -> Self { + fn new_escaped(byte: u8) -> Self { Self::new(byte, true, 1) } /// Create a new code from a [`Symbol`]. - pub fn new_symbol(code: u8, symbol: Symbol) -> Self { + fn new_symbol(code: u8, symbol: Symbol) -> Self { assert_ne!(code, ESCAPE_CODE, "ESCAPE_CODE cannot be used for symbol"); Self::new(code, false, symbol.len() as u16) @@ -203,39 +200,35 @@ impl CodeMeta { /// /// # Panics /// Panic if the value is ≥ the defined `CODE_MAX`. - pub fn from_u16(code: u16) -> Self { + fn from_u16(code: u16) -> Self { assert!((code >> 12) <= 8, "len must be <= 8"); - assert!( - (code & 0b111_111_111) <= MAX_CODE, - "code value higher than MAX_CODE" - ); Self(code) } /// Returns true if the code is for an escape byte. #[inline] - pub fn is_escape(&self) -> bool { + fn is_escape(&self) -> bool { self.0 <= 255 } #[inline] - pub fn code(&self) -> u8 { + fn code(&self) -> u8 { self.0 as u8 } #[inline] - pub fn extended_code(&self) -> u16 { + fn extended_code(&self) -> u16 { self.0 & 0b111_111_111 } #[inline] - pub fn len(&self) -> u16 { + fn len(&self) -> u16 { self.0 >> 12 } #[inline] - pub fn as_u16(&self) -> u16 { + fn as_u16(&self) -> u16 { self.0 } } @@ -406,7 +399,7 @@ impl SymbolTable { out_ptr.write_unaligned(code.code()); } - return (code.len() as usize, 1); + (code.len() as usize, 1) } /// Use the symbol table to compress the plaintext into a sequence of codes and escapes. 
@@ -529,7 +522,7 @@ fn mask_prefix(word: u64, prefix_bytes: usize) -> u64 { let mask = if prefix_bytes == 0 { 0 } else { - u64::MAX >> 8 * (8 - prefix_bytes) + u64::MAX >> (8 * (8 - prefix_bytes)) }; word & mask @@ -544,11 +537,11 @@ fn advance_8byte_word(word: u64, bytes: usize) -> u64 { if bytes == 8 { 0 } else { - word >> 8 * bytes + word >> (8 * bytes) } } -pub fn advance_8byte_word_bits(word: u64, bits: usize) -> u64 { +fn advance_8byte_word_bits(word: u64, bits: usize) -> u64 { // shift the word off the right-end, because little endian means the first // char is stored in the LSB. // diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs index e6097da..ba87e0e 100644 --- a/src/lossy_pht.rs +++ b/src/lossy_pht.rs @@ -1,6 +1,5 @@ use std::fmt::Debug; use std::fmt::Formatter; -use std::u16; use crate::CodeMeta; use crate::Symbol; @@ -68,7 +67,7 @@ impl PackedMeta { /// Always <= 64 #[inline] pub(crate) fn ignored_bits(&self) -> u16 { - (self.0 >> 9) as u16 + self.0 >> 9 } /// Get the code value. 
@@ -175,13 +174,13 @@ impl LossyPHT { let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); let entry = &mut self.slots[slot]; - if !entry.is_unused() { - return false; + if entry.is_unused() { + false } else { entry.symbol = symbol; entry.code = CodeMeta::new_symbol(code, symbol); entry.ignored_bits = (64 - 8 * symbol.len()) as u16; - return true; + true } } diff --git a/tests/correctness.rs b/tests/correctness.rs index fabef5b..31bde28 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -1,3 +1,5 @@ +#![cfg(test)] + static PREAMBLE: &str = r#" When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the From a4cefbfc5fc0b6b8d47eedb7fb11362ec27acf3b Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 16:40:47 -0400 Subject: [PATCH 08/20] make cargo doc happy --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 41b9d08..1a457ab 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,7 +32,7 @@ mod find_longest; mod lossy_pht; /// `Symbol`s are small (up to 8-byte) segments of strings, stored in a [`SymbolTable`] and -/// identified by an 8-bit [`Code`]. +/// identified by an 8-bit code. 
#[derive(Copy, Clone)] pub union Symbol { bytes: [u8; 8], From 6a5ee5c9392b7445345a905ca841cefba493f3e9 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 16:44:36 -0400 Subject: [PATCH 09/20] remove --- .cargo/config.toml | 22 ---------------------- .gitignore | 10 ---------- 2 files changed, 32 deletions(-) delete mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml deleted file mode 100644 index a8ffa27..0000000 --- a/.cargo/config.toml +++ /dev/null @@ -1,22 +0,0 @@ -[target.aarch64-apple-darwin] -rustflags = [ - "-C", - "link-arg=-undefined", - "-C", - "link-arg=dynamic_lookup", - "-Z", - "verbose-internals", - "-Z", - "track-diagnostics", -] -[target.x86_64-apple-darwin] -rustflags = [ - "-C", - "link-arg=-undefined", - "-C", - "link-arg=dynamic_lookup", - "-Z", - "verbose-internals", - "-Z", - "track-diagnostics", -] diff --git a/.gitignore b/.gitignore index 42ffee0..c403c34 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,2 @@ /target .idea/ - - -# Added by cargo -# -# already existing elements were commented out - -#/target - -# compiler debug reports -rustc-ice* From 54deb7fff11ae86b4aa32355dfe7f2847cba14ff Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 16:46:08 -0400 Subject: [PATCH 10/20] use stable toolchain --- rust-toolchain.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 04a6423..23591c9 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,4 @@ [toolchain] -# channel = "stable" -channel = "nightly-2024-08-14" +channel = "stable" components = ["rust-src", "rustfmt", "clippy"] profile = "minimal" From 0fce5630d2ed3d96c6f8b26dfba44f42e77fa68f Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 16:50:55 -0400 Subject: [PATCH 11/20] remove no longer used stuff --- src/builder.rs | 2 +- src/find_longest/mod.rs | 2 - src/find_longest/naive.rs | 2 +- src/lib.rs | 64 
++---------------------- src/lossy_pht.rs | 100 -------------------------------------- 5 files changed, 6 insertions(+), 164 deletions(-) diff --git a/src/builder.rs b/src/builder.rs index 90dcdaa..ed4b697 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -22,7 +22,7 @@ use std::cmp::Ordering; use std::collections::BinaryHeap; use crate::find_longest::FindLongestSymbol; -use crate::{CodeMeta, Symbol, SymbolTable, MAX_CODE}; +use crate::{Symbol, SymbolTable, MAX_CODE}; #[derive(Debug, Clone)] struct Counter { diff --git a/src/find_longest/mod.rs b/src/find_longest/mod.rs index 074205d..9bdfe6b 100644 --- a/src/find_longest/mod.rs +++ b/src/find_longest/mod.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::CodeMeta; - mod naive; pub trait FindLongestSymbol { diff --git a/src/find_longest/naive.rs b/src/find_longest/naive.rs index 8819fb8..3ead59d 100644 --- a/src/find_longest/naive.rs +++ b/src/find_longest/naive.rs @@ -13,7 +13,7 @@ // limitations under the License. use crate::find_longest::FindLongestSymbol; -use crate::{CodeMeta, SymbolTable}; +use crate::SymbolTable; // Find the code that maps to a symbol with longest-match to a piece of text. // diff --git a/src/lib.rs b/src/lib.rs index 1a457ab..c5487df 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![allow(unused)] #![doc = include_str!("../README.md")] /// Throw a compiler error if a type isn't guaranteed to have a specific size in bytes. @@ -184,11 +183,6 @@ impl CodeMeta { Self(value) } - /// Create a new code representing an escape byte. - fn new_escaped(byte: u8) -> Self { - Self::new(byte, true, 1) - } - /// Create a new code from a [`Symbol`]. 
fn new_symbol(code: u8, symbol: Symbol) -> Self { assert_ne!(code, ESCAPE_CODE, "ESCAPE_CODE cannot be used for symbol"); @@ -196,22 +190,6 @@ impl CodeMeta { Self::new(code, false, symbol.len() as u16) } - /// Create a `Code` directly from a `u16` value. - /// - /// # Panics - /// Panic if the value is ≥ the defined `CODE_MAX`. - fn from_u16(code: u16) -> Self { - assert!((code >> 12) <= 8, "len must be <= 8"); - - Self(code) - } - - /// Returns true if the code is for an escape byte. - #[inline] - fn is_escape(&self) -> bool { - self.0 <= 255 - } - #[inline] fn code(&self) -> u8 { self.0 as u8 @@ -226,11 +204,6 @@ impl CodeMeta { fn len(&self) -> u16 { self.0 >> 12 } - - #[inline] - fn as_u16(&self) -> u16 { - self.0 - } } impl Debug for CodeMeta { @@ -284,7 +257,7 @@ impl Default for SymbolTable { let mut table = Self { symbols: [Symbol::ZERO; 511], n_symbols: 0, - codes_twobyte: Vec::with_capacity(65_536), + codes_twobyte: vec![CodeMeta::EMPTY; 65_536], lossy_pht: LossyPHT::new(), }; @@ -293,16 +266,6 @@ impl Default for SymbolTable { table.symbols[byte as usize] = Symbol::from_u8(byte); } - // Populate the "codes" for twobytes to default to the escape sequence - // for the first byte - for first in 0..256 { - for _second in 0..256 { - // let default_code = CodeMeta::new_escaped(first as u8); - // table.codes_twobyte.push(default_code); - table.codes_twobyte.push(CodeMeta::EMPTY) - } - } - table } } @@ -355,12 +318,7 @@ impl SymbolTable { /// /// `in_ptr` and `out_ptr` must never be NULL or otherwise point to invalid memory. #[inline(never)] - pub(crate) unsafe fn compress_word( - &self, - word: u64, - out_ptr: *mut u8, - out_start: *mut u8, - ) -> (usize, usize) { + pub(crate) unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) { // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and // if it isn't, it will be overwritten anyway. 
// @@ -412,7 +370,6 @@ impl SymbolTable { let mut in_ptr = plaintext.as_ptr(); let mut out_ptr = values.as_mut_ptr(); - let out_start = values.as_mut_ptr(); // SAFETY: `end` will point just after the end of the `plaintext` slice. let in_end = unsafe { in_ptr.byte_add(plaintext.len()) }; @@ -426,7 +383,7 @@ impl SymbolTable { // Load a full 8-byte word of data from in_ptr. // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. let word: u64 = (in_ptr as *const u64).read_unaligned(); - let (advance_in, advance_out) = self.compress_word(word, out_ptr, out_start); + let (advance_in, advance_out) = self.compress_word(word, out_ptr); in_ptr = in_ptr.byte_add(advance_in); out_ptr = out_ptr.byte_add(advance_out); }; @@ -446,7 +403,7 @@ impl SymbolTable { unsafe { // Load a full 8-byte word of data from in_ptr. // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. - let (advance_in, advance_out) = self.compress_word(last_word, out_ptr, out_start); + let (advance_in, advance_out) = self.compress_word(last_word, out_ptr); in_ptr = in_ptr.byte_add(advance_in); out_ptr = out_ptr.byte_add(advance_out); @@ -541,19 +498,6 @@ fn advance_8byte_word(word: u64, bytes: usize) -> u64 { } } -fn advance_8byte_word_bits(word: u64, bits: usize) -> u64 { - // shift the word off the right-end, because little endian means the first - // char is stored in the LSB. - // - // Note that even though this looks like it branches, Rust compiles this to a - // conditional move instruction. 
See `` - if bits == 64 { - 0 - } else { - word >> bits - } -} - fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { let mask = if ignored_bits == 64 { 0 diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs index ba87e0e..f7ca6c0 100644 --- a/src/lossy_pht.rs +++ b/src/lossy_pht.rs @@ -1,5 +1,4 @@ use std::fmt::Debug; -use std::fmt::Formatter; use crate::CodeMeta; use crate::Symbol; @@ -12,94 +11,6 @@ use crate::MAX_CODE; /// vendors split the L1 cache into 32KB of instruction and 32KB of data. pub const HASH_TABLE_SIZE: usize = 1 << 11; -/// Bit-packed metadata for a [`TableEntry`] -/// -/// Bitpacked layout: -/// -/// bits 9-15: ignored bits in the symbol. Equivalent to 64 - symbol.len()*8 -/// bit 8: the "unused" flag -/// bits 0-7: code value (0-254) -#[derive(Clone, Copy)] -#[repr(C)] -pub(crate) struct PackedMeta(u16); - -assert_sizeof!(PackedMeta => 2); - -impl PackedMeta { - /// Constant unused instance. - /// - /// All bits are set, corresponding to - /// - /// 6 bits set for `ignored bits` - /// 1 unused bit - /// 1 bit to indicate the `unused` flag - /// 8 bits of `code` data - pub const UNUSED: Self = Self(0b10000001_11111111); - - /// The 8th bit toggles if the slot is unused or not. - const UNUSED_FLAG: u16 = 1 << 8; - - /// Create a new `PackedSymbolMeta` from raw parts. - /// - /// # Panics - /// If `len` > 8 or `code` > [`Code::CODE_MAX`] - pub fn new(len: u16, code: u8) -> Self { - assert!(len <= 8, "cannot construct PackedCode with len > 8"); - - let ignored_bits = 64 - 8 * len; - - let packed = (ignored_bits << 9) | (code as u16); - Self(packed) - } - - /// Import a `PackedSymbolMeta` from a raw `u16`. - pub fn from_u16(value: u16) -> Self { - assert!( - (value >> 9) <= 64, - "cannot construct PackedCode with len > 8" - ); - - Self(value) - } - - /// Get the number of ignored bits in the corresponding symbol's `u64` representation. 
- /// - /// Always <= 64 - #[inline] - pub(crate) fn ignored_bits(&self) -> u16 { - self.0 >> 9 - } - - /// Get the code value. - #[inline] - pub(crate) fn code(&self) -> u8 { - self.0 as u8 - } - - /// Check if the unused flag is set - #[inline] - pub(crate) fn is_unused(&self) -> bool { - (self.0 & Self::UNUSED_FLAG) != 0 - } -} - -impl Default for PackedMeta { - fn default() -> Self { - // The default implementation of a `PackedMeta` is one where only the `UNUSED_FLAG` is set, - // representing an unused slot in the table. - Self::UNUSED - } -} - -impl Debug for PackedMeta { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("PackedCode") - .field("ignored_bits", &self.ignored_bits()) - .field("code", &self.code()) - .finish() - } -} - /// A single entry in the [`SymbolTable`]. /// /// `TableEntry` is based on the `Symbol` class outlined in Algorithm 4 of the FSST paper. See @@ -205,14 +116,3 @@ impl Default for LossyPHT { Self::new() } } - -#[cfg(test)] -mod test { - use crate::lossy_pht::PackedMeta; - - #[test] - fn test_packedmeta() { - assert!(PackedMeta::UNUSED.is_unused()); - assert_eq!(PackedMeta::UNUSED.ignored_bits(), 64); - } -} From 569e36569c9e9a344180e26b3b62a8f7385cd1a9 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 16:52:41 -0400 Subject: [PATCH 12/20] license headers --- Cargo.toml | 14 ++++++++++++++ benches/compress.rs | 14 ++++++++++++++ examples/round_trip.rs | 14 ++++++++++++++ src/lossy_pht.rs | 14 ++++++++++++++ tests/correctness.rs | 14 ++++++++++++++ 5 files changed, 70 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 31f9e7f..802dcd7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,17 @@ +# Copyright 2024 Spiral, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + [package] name = "fsst-rs" version = "0.0.1" diff --git a/benches/compress.rs b/benches/compress.rs index 9581c9c..f2fa6b2 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -1,3 +1,17 @@ +// Copyright 2024 Spiral, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + //! Compression benchmark. //! //! Contains benchmarks for FSST compression, decompression, and symbol table training. diff --git a/examples/round_trip.rs b/examples/round_trip.rs index 0f3fab7..8681230 100644 --- a/examples/round_trip.rs +++ b/examples/round_trip.rs @@ -1,3 +1,17 @@ +// Copyright 2024 Spiral, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + //! Simple example where we show round-tripping a string through the static symbol table. use core::str; diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs index f7ca6c0..9982665 100644 --- a/src/lossy_pht.rs +++ b/src/lossy_pht.rs @@ -1,3 +1,17 @@ +// Copyright 2024 Spiral, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + use std::fmt::Debug; use crate::CodeMeta; diff --git a/tests/correctness.rs b/tests/correctness.rs index 31bde28..52ebe10 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -1,3 +1,17 @@ +// Copyright 2024 Spiral, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ #![cfg(test)] static PREAMBLE: &str = r#" From 75acb6c1adfc0ea4054b383e0265a9b94455dc9a Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 21:30:30 -0400 Subject: [PATCH 13/20] remove copyright --- Cargo.toml | 14 -------------- benches/compress.rs | 14 -------------- examples/round_trip.rs | 14 -------------- src/builder.rs | 18 ++---------------- src/find_longest/mod.rs | 14 -------------- src/find_longest/naive.rs | 14 -------------- src/lib.rs | 19 ++++--------------- src/lossy_pht.rs | 14 -------------- tests/correctness.rs | 36 +++++++++++++----------------------- 9 files changed, 19 insertions(+), 138 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 802dcd7..31f9e7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,17 +1,3 @@ -# Copyright 2024 Spiral, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - [package] name = "fsst-rs" version = "0.0.1" diff --git a/benches/compress.rs b/benches/compress.rs index f2fa6b2..9581c9c 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -1,17 +1,3 @@ -// Copyright 2024 Spiral, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - //! Compression benchmark. //! //! Contains benchmarks for FSST compression, decompression, and symbol table training. diff --git a/examples/round_trip.rs b/examples/round_trip.rs index 8681230..0f3fab7 100644 --- a/examples/round_trip.rs +++ b/examples/round_trip.rs @@ -1,17 +1,3 @@ -// Copyright 2024 Spiral, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - //! Simple example where we show round-tripping a string through the static symbol table. use core::str; diff --git a/src/builder.rs b/src/builder.rs index ed4b697..3c28b8b 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -1,17 +1,3 @@ -// Copyright 2024 Spiral, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - //! Functions and types used for building a [`SymbolTable`] from a corpus of text. //! //! This module implements the logic from Algorithm 3 of the [FSST Paper]. @@ -102,6 +88,7 @@ impl SymbolTable { let mut pos = self.symbols[prev_code as usize].len(); while pos < len { + println!("loop pos = {pos} len = {len}"); let code = self.find_longest_symbol(&sample[pos..len]); counter.record_count1(code); counter.record_count2(prev_code, code); @@ -210,12 +197,11 @@ mod test { // Use the table to compress a string, see the values let compressed = table.compress(text.as_bytes()); - assert_eq!(compressed, vec![0u8, 1u8, 2u8]); // Ensure that the compressed string has no escape bytes assert!(compressed.iter().all(|b| *b != ESCAPE_CODE)); - // Ensure that we can compress a string with no values seen at training time. + // Ensure that we can compress a string with no values seen at training time, with escape bytes let compressed = table.compress("xyz123".as_bytes()); assert_eq!( compressed, diff --git a/src/find_longest/mod.rs b/src/find_longest/mod.rs index 9bdfe6b..00eb7b2 100644 --- a/src/find_longest/mod.rs +++ b/src/find_longest/mod.rs @@ -1,17 +1,3 @@ -// Copyright 2024 Spiral, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - mod naive; pub trait FindLongestSymbol { diff --git a/src/find_longest/naive.rs b/src/find_longest/naive.rs index 3ead59d..c75ecad 100644 --- a/src/find_longest/naive.rs +++ b/src/find_longest/naive.rs @@ -1,17 +1,3 @@ -// Copyright 2024 Spiral, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - use crate::find_longest::FindLongestSymbol; use crate::SymbolTable; diff --git a/src/lib.rs b/src/lib.rs index c5487df..ba21fea 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,17 +1,3 @@ -// Copyright 2024 Spiral, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - #![doc = include_str!("../README.md")] /// Throw a compiler error if a type isn't guaranteed to have a specific size in bytes. @@ -226,7 +212,9 @@ impl Debug for CodeMeta { /// ``` /// use fsst_rs::{Symbol, SymbolTable}; /// let mut table = SymbolTable::default(); -/// table.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', 0, 0, 0])); +/// +/// // Insert a new symbol +/// assert!(table.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', 0, 0, 0]))); /// /// let compressed = table.compress("hello".as_bytes()); /// assert_eq!(compressed, vec![0u8]); @@ -290,6 +278,7 @@ impl SymbolTable { } else if symbol_len >= 3 { // Attempt to insert larger symbols into the 3-byte cache if !self.lossy_pht.insert(symbol, self.n_symbols) { + println!("table insert rejected"); return false; } } diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs index 9982665..f7ca6c0 100644 --- a/src/lossy_pht.rs +++ b/src/lossy_pht.rs @@ -1,17 +1,3 @@ -// Copyright 2024 Spiral, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - use std::fmt::Debug; use crate::CodeMeta; diff --git a/tests/correctness.rs b/tests/correctness.rs index 52ebe10..65b2000 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -1,17 +1,3 @@ -// Copyright 2024 Spiral, Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - #![cfg(test)] static PREAMBLE: &str = r#" @@ -43,20 +29,24 @@ fn test_train_on_empty() { ); } -#[test] -fn test_zeros() { - // make sure we don't panic if there are zeros in the training or input data - let training_data: Vec<u8> = vec![0, 1, 2, 3, 4]; - let trained = fsst_rs::train(&training_data); - let compressed = trained.compress(&[0, 4]); - assert_eq!(trained.decompress(&compressed), &[0, 4]); -} +// #[test] +// fn test_zeros() { +// println!("training zeros"); +// let training_data: Vec<u8> = vec![0, 1, 2, 3, 4]; +// let trained = fsst_rs::train(&training_data); +// println!("compressing with zeros"); +// let compressed = trained.compress(&[0, 4]); +// println!("decomperssing with zeros"); +// assert_eq!(trained.decompress(&compressed), &[0, 4]); +// println!("done"); +// } #[test] fn test_large() { // Generate 100KB of test data let mut corpus = String::new(); - while corpus.len() < 8 * 1_024 * 1_024 { + // TODO(aduffy): make this larger once table build performance is better. 
+ while corpus.len() < 10 * 1_024 { corpus.push_str(DECLARATION); } From 3bfff500ab0625ce55aef3345984d766b0e4d5b6 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 21:48:27 -0400 Subject: [PATCH 14/20] fix table insert --- benches/compress.rs | 23 ----------------------- src/builder.rs | 1 - src/lib.rs | 1 - src/lossy_pht.rs | 2 +- 4 files changed, 1 insertion(+), 26 deletions(-) diff --git a/benches/compress.rs b/benches/compress.rs index 9581c9c..603eca1 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -51,29 +51,6 @@ fn bench_fsst(c: &mut Criterion) { fn bench_lz4(c: &mut Criterion) { let mut group = c.benchmark_group("lz4"); - // { - // let compressed = Vec::with_capacity(10_000); - // let mut encoder = lz4::EncoderBuilder::new() - // .block_size(BlockSize::Max64KB) - // .build(compressed) - // .unwrap(); - // - // encoder.write_all(TEST.as_bytes()).unwrap(); - // let (compressed, result) = encoder.finish(); - // result.unwrap(); - // - // let ratio = (TEST.as_bytes().len() as f64) / (compressed.len() as f64); - // println!("LZ4 compress_ratio = {ratio}"); - // - // // ensure decodes cleanly - // let cursor = Cursor::new(compressed); - // let mut decoder = lz4::Decoder::new(cursor).unwrap(); - // let mut output = String::new(); - // - // decoder.read_to_string(&mut output).unwrap(); - // assert_eq!(output.as_str(), TEST); - // } - group.bench_function("compress-single", |b| { let mut compressed = Vec::with_capacity(100_000_000); let mut encoder = lz4::EncoderBuilder::new() diff --git a/src/builder.rs b/src/builder.rs index 3c28b8b..558a3b4 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -88,7 +88,6 @@ impl SymbolTable { let mut pos = self.symbols[prev_code as usize].len(); while pos < len { - println!("loop pos = {pos} len = {len}"); let code = self.find_longest_symbol(&sample[pos..len]); counter.record_count1(code); counter.record_count2(prev_code, code); diff --git a/src/lib.rs b/src/lib.rs index ba21fea..5350fb3 100644 --- 
a/src/lib.rs +++ b/src/lib.rs @@ -278,7 +278,6 @@ impl SymbolTable { } else if symbol_len >= 3 { // Attempt to insert larger symbols into the 3-byte cache if !self.lossy_pht.insert(symbol, self.n_symbols) { - println!("table insert rejected"); return false; } } diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs index f7ca6c0..ada8980 100644 --- a/src/lossy_pht.rs +++ b/src/lossy_pht.rs @@ -85,7 +85,7 @@ impl LossyPHT { let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); let entry = &mut self.slots[slot]; - if entry.is_unused() { + if !entry.is_unused() { false } else { entry.symbol = symbol; From 489537fd7f434fbb159acba493e90c22357e74f7 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 22:09:27 -0400 Subject: [PATCH 15/20] fix test --- src/lossy_pht.rs | 2 +- tests/correctness.rs | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs index ada8980..db4bcf5 100644 --- a/src/lossy_pht.rs +++ b/src/lossy_pht.rs @@ -11,7 +11,7 @@ use crate::MAX_CODE; /// vendors split the L1 cache into 32KB of instruction and 32KB of data. pub const HASH_TABLE_SIZE: usize = 1 << 11; -/// A single entry in the [`SymbolTable`]. +/// A single entry in the [Lossy Perfect Hash Table][`LossyPHT`]. /// /// `TableEntry` is based on the `Symbol` class outlined in Algorithm 4 of the FSST paper. See /// the module documentation for a link to the paper. diff --git a/tests/correctness.rs b/tests/correctness.rs index 65b2000..f4f752f 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -43,7 +43,6 @@ fn test_train_on_empty() { #[test] fn test_large() { - // Generate 100KB of test data let mut corpus = String::new(); // TODO(aduffy): make this larger once table build performance is better. 
while corpus.len() < 10 * 1_024 { @@ -51,6 +50,10 @@ fn test_large() { } let trained = fsst_rs::train(&corpus); - let compressed = trained.compress(corpus.as_bytes()); - assert_eq!(trained.decompress(&compressed), corpus.as_bytes()); + let mut massive = String::new(); + while massive.len() < 16 * 1_024 * 1_024 { + massive.push_str(DECLARATION); + } + let compressed = trained.compress(massive.as_bytes()); + assert_eq!(trained.decompress(&compressed), massive.as_bytes()); } From a6ade02a860d0aa074d1f225cfa6b83a1de33aa4 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 22:48:58 -0400 Subject: [PATCH 16/20] add file compressor example --- examples/file_compressor.rs | 67 +++++++++++++++++++++++++++++++++++++ src/lib.rs | 3 +- 2 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 examples/file_compressor.rs diff --git a/examples/file_compressor.rs b/examples/file_compressor.rs new file mode 100644 index 0000000..f820971 --- /dev/null +++ b/examples/file_compressor.rs @@ -0,0 +1,67 @@ +#![allow(missing_docs)] + +//! This is a command line program that expects two input files as arguments. +//! +//! The first is the file to train a symbol table on. +//! +//! The second is the file to compress. The compressed file will be written +//! 
as a sibling with the suffix ".fsst" + +use std::{ + fs::File, + io::Read, + os::unix::fs::{FileExt, MetadataExt}, + path::Path, +}; + +fn main() { + let args: Vec<_> = std::env::args().skip(1).collect(); + assert!(args.len() >= 2, "args TRAINING and FILE must be provided"); + + let train_path = Path::new(&args[0]); + let input_path = Path::new(&args[1]); + + let mut train_text = String::new(); + { + let mut f = File::open(train_path).unwrap(); + f.read_to_string(&mut train_text).unwrap(); + } + + println!("building the compressor from {train_path:?}..."); + let compressor = fsst_rs::train(&train_text); + + println!("compressing blocks of {input_path:?} with compressor..."); + + let f = File::open(input_path).unwrap(); + let size_bytes = f.metadata().unwrap().size() as usize; + + const CHUNK_SIZE: usize = 16 * 1024 * 1024; + + let mut chunk_idx = 1; + let mut pos = 0; + let mut chunk = Vec::with_capacity(CHUNK_SIZE); + unsafe { chunk.set_len(CHUNK_SIZE) }; + while pos + CHUNK_SIZE < size_bytes { + f.read_exact_at(&mut chunk, pos as u64).unwrap(); + // Compress the chunk, don't write it anywhere. + let compact = compressor.compress(&chunk); + let compression_ratio = (CHUNK_SIZE as f64) / (compact.len() as f64); + println!("compressed chunk {chunk_idx} with ratio {compression_ratio}"); + + pos += CHUNK_SIZE; + chunk_idx += 1; + } + + // Read last chunk with a new custom-sized buffer. + if pos < size_bytes { + let amount = size_bytes - pos; + chunk = Vec::with_capacity(size_bytes - pos); + unsafe { chunk.set_len(amount) }; + f.read_exact_at(&mut chunk, pos as u64).unwrap(); + // Compress the chunk, don't write it anywhere. 
+ let compact = compressor.compress(&chunk[0..amount]); + let compression_ratio = (amount as f64) / (compact.len() as f64); + println!("compressed chunk {chunk_idx} with ratio {compression_ratio}"); + } + println!("done"); +} diff --git a/src/lib.rs b/src/lib.rs index 5350fb3..da195a2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -305,7 +305,8 @@ impl SymbolTable { /// # Safety /// /// `in_ptr` and `out_ptr` must never be NULL or otherwise point to invalid memory. - #[inline(never)] + // NOTE(aduffy): uncomment this line to make the function appear in profiles + // #[inline(never)] pub(crate) unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) { // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and // if it isn't, it will be overwritten anyway. From 4fbbb99815ff30ec3c9937cc3b4644e66d86b0ef Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 22:51:13 -0400 Subject: [PATCH 17/20] clarify doc --- examples/file_compressor.rs | 11 ++++++++--- src/lib.rs | 10 ++-------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/examples/file_compressor.rs b/examples/file_compressor.rs index f820971..c116d99 100644 --- a/examples/file_compressor.rs +++ b/examples/file_compressor.rs @@ -4,9 +4,14 @@ //! //! The first is the file to train a symbol table on. //! -//! The second is the file to compress. The compressed file will be written -//! as a sibling with the suffix ".fsst" - +//! The second is the file to compress. The compressor will run and compress +//! in chunks of 16MB, logging the compression ratio for each chunk. +//! +//! Example: +//! +//! ``` +//! cargo run --release --example file_compressor -- file1.csv file2.csv +//! ``` use std::{ fs::File, io::Read, diff --git a/src/lib.rs b/src/lib.rs index da195a2..c5896b6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -76,10 +76,7 @@ impl Symbol { /// Get the first byte of the symbol as a `u8`. 
/// - /// # Safety - /// The function will never panic, but if the symbol's len is < 1, the - /// result may be meaningless. It is up to the caller to ensure that - /// the first byte of the symbol contains valid data. + /// If the symbol is empty, this will return the zero byte. #[inline] pub fn first_byte(&self) -> u8 { // SAFETY: the bytes can always be viewed as a u64 @@ -88,10 +85,7 @@ impl Symbol { /// Get the first two bytes of the symbol as a `u16`. /// - /// # Safety - /// The function will never panic, but if the symbol's len is < 2, the - /// result may be meaningless. It is up to the caller to ensure that - /// the first two bytes of the symbol contain valid data. + /// If the Symbol is one or zero bytes, this will return `0u16`. #[inline] pub fn first_two_bytes(&self) -> u16 { // SAFETY: the bytes can always be viewed as a u64 From 3b14e8d61a05f01ad438302be0fc4dc809dd21a0 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 14 Aug 2024 22:54:08 -0400 Subject: [PATCH 18/20] cleanup --- examples/file_compressor.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/file_compressor.rs b/examples/file_compressor.rs index c116d99..f260620 100644 --- a/examples/file_compressor.rs +++ b/examples/file_compressor.rs @@ -1,4 +1,4 @@ -#![allow(missing_docs)] +#![allow(missing_docs, clippy::use_debug)] //! This is a command line program that expects two input files as arguments. //! @@ -44,8 +44,7 @@ fn main() { let mut chunk_idx = 1; let mut pos = 0; - let mut chunk = Vec::with_capacity(CHUNK_SIZE); - unsafe { chunk.set_len(CHUNK_SIZE) }; + let mut chunk = vec![0u8; CHUNK_SIZE]; while pos + CHUNK_SIZE < size_bytes { f.read_exact_at(&mut chunk, pos as u64).unwrap(); // Compress the chunk, don't write it anywhere. @@ -60,8 +59,7 @@ fn main() { // Read last chunk with a new custom-sized buffer. 
if pos < size_bytes { let amount = size_bytes - pos; - chunk = Vec::with_capacity(size_bytes - pos); - unsafe { chunk.set_len(amount) }; + chunk = vec![0u8; size_bytes - pos]; f.read_exact_at(&mut chunk, pos as u64).unwrap(); // Compress the chunk, don't write it anywhere. let compact = compressor.compress(&chunk[0..amount]); From e9b41bc9f2a58646cf180c2a6fdab74f545a637d Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 15 Aug 2024 10:12:09 -0400 Subject: [PATCH 19/20] handle zero bytes in input properly --- rust-toolchain.toml | 2 +- src/lib.rs | 13 ++++++++++--- tests/correctness.rs | 22 +++++++++++----------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 23591c9..2296533 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,4 +1,4 @@ [toolchain] -channel = "stable" +channel = "nightly-2024-08-14" components = ["rust-src", "rustfmt", "clippy"] profile = "minimal" diff --git a/src/lib.rs b/src/lib.rs index c5896b6..7191b00 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -58,7 +58,14 @@ impl Symbol { // For little-endian platforms, this counts the number of *trailing* zeros let null_bytes = (numeric.leading_zeros() >> 3) as usize; - size_of::() - null_bytes + // Special case handling of a symbol with all-zeros. This is actually + // a 1-byte symbol containing 0x00. + let len = size_of::() - null_bytes; + if len == 0 { + 1 + } else { + len + } } /// Returns true if the symbol does not encode any bytes. @@ -298,9 +305,9 @@ impl SymbolTable { /// /// # Safety /// - /// `in_ptr` and `out_ptr` must never be NULL or otherwise point to invalid memory. + /// `out_ptr` must never be NULL or otherwise point to invalid memory. // NOTE(aduffy): uncomment this line to make the function appear in profiles - // #[inline(never)] + #[inline(never)] pub(crate) unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) { // Speculatively write the first byte of `word` at offset 1. 
This is necessary if it is an escape, and // if it isn't, it will be overwritten anyway. diff --git a/tests/correctness.rs b/tests/correctness.rs index f4f752f..8773bc7 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -29,17 +29,17 @@ fn test_train_on_empty() { ); } -// #[test] -// fn test_zeros() { -// println!("training zeros"); -// let training_data: Vec = vec![0, 1, 2, 3, 4]; -// let trained = fsst_rs::train(&training_data); -// println!("compressing with zeros"); -// let compressed = trained.compress(&[0, 4]); -// println!("decomperssing with zeros"); -// assert_eq!(trained.decompress(&compressed), &[0, 4]); -// println!("done"); -// } +#[test] +fn test_zeros() { + println!("training zeros"); + let training_data: Vec = vec![0, 1, 2, 3, 4]; + let trained = fsst_rs::train(&training_data); + println!("compressing with zeros"); + let compressed = trained.compress(&[0, 4]); + println!("decomperssing with zeros"); + assert_eq!(trained.decompress(&compressed), &[0, 4]); + println!("done"); +} #[test] fn test_large() { From b97469ebd3a1e4fd02e4d61bd923b09131c77722 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 15 Aug 2024 10:44:54 -0400 Subject: [PATCH 20/20] update file_compressor example to work for non-text data --- examples/file_compressor.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/file_compressor.rs b/examples/file_compressor.rs index f260620..3dab660 100644 --- a/examples/file_compressor.rs +++ b/examples/file_compressor.rs @@ -26,14 +26,14 @@ fn main() { let train_path = Path::new(&args[0]); let input_path = Path::new(&args[1]); - let mut train_text = String::new(); + let mut train_bytes = Vec::new(); { let mut f = File::open(train_path).unwrap(); - f.read_to_string(&mut train_text).unwrap(); + f.read_to_end(&mut train_bytes).unwrap(); } println!("building the compressor from {train_path:?}..."); - let compressor = fsst_rs::train(&train_text); + let compressor = fsst_rs::train(&train_bytes); 
println!("compressing blocks of {input_path:?} with compressor...");