From 2de9b2f63b590be8d4945c6266392d8d7b646379 Mon Sep 17 00:00:00 2001 From: Jonas Trevisan Date: Thu, 23 Jan 2025 12:19:17 -0300 Subject: [PATCH] Fiz long encoding/decoding --- lib/avro/datum.php | 97 ++++++++++++++++++++++++++++++++++---------- test/DatumIOTest.php | 45 +++++++++++++++++++- 2 files changed, 119 insertions(+), 23 deletions(-) diff --git a/lib/avro/datum.php b/lib/avro/datum.php index a810088..2d7d274 100644 --- a/lib/avro/datum.php +++ b/lib/avro/datum.php @@ -43,6 +43,79 @@ public function __construct($expected_schema, $datum) } } +/** + * Zigzag implementation to encode longs + * https://en.wikipedia.org/wiki/Variable-length_quantity#Zigzag_encoding + * + * @package Avro + */ +class Zigzag { + + /** + * Implementation of unsigned shift right as PHP does not have the `>>>` operator + * + * @param int $n + * @param int $x + * + * @return int + */ + public static function unsigned_right_shift(int $n, int $x): int + { + $byte_bits = 8; + $platform_bits = PHP_INT_SIZE * $byte_bits; + + // $sign_mask will be a row of 1s (0xFFFFFFFF) if $n is + // negative, and a row of 0s (0x00000000) if $n is positive + $sign_mask = ($n >> ($platform_bits - 1)); + + // $filler_removal_mask will have the $x left most bits set + // to 1 and the other bits set to 0 if $n is negative. + // if $n is positive, sign mask will have all bit set to 0 + $filler_removal_mask = $sign_mask << $platform_bits - $x; + + // right shift $n with $x and apply the mask to flip the + // left most bits to 0 if $n is negative and do nothing otherwise + return ($n >> $x) ^ $filler_removal_mask; + } + + /** + * @param int|string $n + * @return string long $n encoded as bytes + * @internal This relies on 64-bit PHP. + */ + public static function encode_long($n): string + { + $n = (int) $n; + $n = ($n << 1) ^ ($n >> 63); + $str = ''; + if (($n & ~0x7F) != 0) { + $str .= chr(($n | 0x80) & 0xFF); + $n = self::unsigned_right_shift($n, 7); + + while ($n > 0x7F) { + $str .= chr(($n | 0x80) & 0xFF); + $n = self::unsigned_right_shift($n, 7); + } + } + + $str .= chr($n); + return $str; + } + + public static function decode_long(array $bytes): int { + $b = array_shift($bytes); + $n = $b & 0x7f; + $shift = 7; + while (0 != ($b & 0x80)) + { + $b = array_shift($bytes); + $n |= (($b & 0x7f) << $shift); + $shift += 7; + } + return self::unsigned_right_shift($n, 1) ^ -($n & 1); + } +} + /** * Exceptions arising from incompatibility between * reader and writer schemas. @@ -304,18 +377,9 @@ static function double_to_long_bits($double) * @return string long $n encoded as bytes * @internal This relies on 64-bit PHP. */ - static public function encode_long($n) + public static function encode_long($n): string { - $n = (int) $n; - $n = ($n << 1) ^ ($n >> 63); - $str = ''; - while (0 != ($n & ~0x7F)) - { - $str .= chr(($n & 0x7F) | 0x80); - $n >>= 7; - } - $str .= chr($n); - return $str; + return Zigzag::encode_long($n); } /** @@ -931,16 +995,7 @@ class AvroIOBinaryDecoder */ public static function decode_long_from_array($bytes) { - $b = array_shift($bytes); - $n = $b & 0x7f; - $shift = 7; - while (0 != ($b & 0x80)) - { - $b = array_shift($bytes); - $n |= (($b & 0x7f) << $shift); - $shift += 7; - } - return (($n >> 1) ^ -($n & 1)); + return Zigzag::decode_long($bytes); } /** diff --git a/test/DatumIOTest.php b/test/DatumIOTest.php index 1a07723..a2086c1 100644 --- a/test/DatumIOTest.php +++ b/test/DatumIOTest.php @@ -51,6 +51,37 @@ function test_datum_round_trip($schema_json, $datum, $binary) $this->assertEquals($datum, $read_datum); } + /** + * @dataProvider zigzag_unsigned_right_shift_provider + */ + function test_zigzag_unsigned_right_shift(int $expected, int $n, int $x) { + $this->assertEquals($expected, Zigzag::unsigned_right_shift($n, $x)); + } + + public static function zigzag_unsigned_right_shift_provider(): array { + return [ + [4611686018427387902, -8, 2], + [2, 8, 2], + [144115188075855871, -2, 7], + [1125899906842623, 144115188075855871, 7], + [8796093022207, 1125899906842623, 7], + [68719476735, 8796093022207, 7], + [536870911, 68719476735, 7], + [4194303, 536870911, 7], + [32767, 4194303, 7], + [255, 32767, 7], + [1, 255, 7], + [144115188059078656, -2147483648, 7], + [1125899906711552, 144115188059078656, 7], + [8796093021184, 1125899906711552, 7], + [68719476728, 8796093021184, 7], + [536870911, 68719476728, 7], + [4194303, 536870911, 7], + [32767, 4194303, 7], + [255, 32767, 7], + ]; + } + /** * @return array */ @@ -67,11 +98,21 @@ function data_provider() array('"int"', 1, "\002"), array('"int"', 2147483647, "\xFE\xFF\xFF\xFF\x0F"), - // array('"long"', (int) -9223372036854775808, "\001"), + array('"long"', (int) -9223372036854775808, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x01"), + array('"long"', -(1<<62), "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F"), + array('"long"', -4294967295, "\xFD\xFF\xFF\xFF\x1F"), + array('"long"', -10, "\x13"), + array('"long"', -3, "\005"), + array('"long"', -2, "\003"), array('"long"', -1, "\001"), array('"long"', 0, "\000"), array('"long"', 1, "\002"), - // array('"long"', 9223372036854775807, "\002") + array('"long"', 2, "\004"), + array('"long"', 3, "\006"), + array('"long"', 10, "\x14"), + array('"long"', 4294967295, "\xFE\xFF\xFF\xFF\x1F"), + array('"long"', 1<<62, "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x01"), + array('"long"', 9223372036854775807, "\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x01"), array('"float"', (float) -10.0, "\000\000 \301"), array('"float"', (float) -1.0, "\000\000\200\277"),