From d32cbb3e3258b4e1a80dc33947299a953f54c9b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20Bartolom=C3=A4us?= Date: Fri, 26 Nov 2021 21:41:49 +0100 Subject: [PATCH] [JVM] Reimplement nqp::encode for utf-16 Starting from the bug reported in https://github.com/Raku/nqp/issues/250 it turned out that the old implementation gave different results than MoarVM for all buffer types (at least in some cases). The new code doesn't try to take shortcuts, but just works through all code points one by one, translates them to UTF-16 code units and puts those into the result buffer. I'd guess at least the encodeUTF32 method needs a similar rework. --- .../jvm/runtime/org/raku/nqp/runtime/Ops.java | 171 ++++++++++++++++-- t/nqp/082-decode.t | 73 +++++++- 2 files changed, 231 insertions(+), 13 deletions(-) diff --git a/src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java b/src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java index b20811c85..35a4cb4d8 100644 --- a/src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java +++ b/src/vm/jvm/runtime/org/raku/nqp/runtime/Ops.java @@ -4295,21 +4295,168 @@ private static String javaEncodingName(String nameIn) { } private static void encodeUTF16(String str, SixModelObject res, ThreadContext tc) { - short[] buffer = new short[str.length()]; - for (int i = 0; i < str.length(); i++) - buffer[i] = (short)str.charAt(i); - if (res instanceof VMArrayInstance_i16) { - VMArrayInstance_i16 arr = (VMArrayInstance_i16)res; - arr.elems = buffer.length; - arr.start = 0; - arr.slots = buffer; + int[] buffer = new int[str.length()]; /* Can be an overestimate. 
*/ + int bufPos = 0; + for (int i = 0; i < str.length(); ) { + int cp = str.codePointAt(i); + buffer[bufPos++] = cp; + i += Character.charCount(cp); + } + + /* In the following code we ignore that: + * - 2-byte codepoints between 0xD800 and 0xDFFF are invalid + * - 4-byte codepoints greater than 0x10FFFF are invalid */ + if (res instanceof VMArrayInstance_i8 || res instanceof VMArrayInstance_u8) { + res.set_elems(tc, str.length() * 4); /* Very likely too much. */ + int elems = 0; + for (int i = 0; i < bufPos; i++) { + int cp = buffer[i]; + if (cp < 0x10000) { // 2-byte UTF-16 + tc.native_i = cp & 0xFF; + res.bind_pos_native(tc, elems++); + tc.native_i = cp >> 8; + res.bind_pos_native(tc, elems++); + } + else { // 4-byte UTF-16 + /* Get a 20-bit value between 0x00000 and 0xFFFFF. */ + cp -= 0x10000; + cp &= 0xFFFFF; + int highWord = 0xD800 + (cp >> 10); // 0xD800 + high ten bits + int lowWord = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits + tc.native_i = highWord & 0xFF; + res.bind_pos_native(tc, elems++); + tc.native_i = highWord >> 8; + res.bind_pos_native(tc, elems++); + tc.native_i = lowWord & 0xFF; + res.bind_pos_native(tc, elems++); + tc.native_i = lowWord >> 8; + res.bind_pos_native(tc, elems++); + } + } + res.set_elems(tc, elems); /* Shorten VMArray as needed. */ + } + else if (res instanceof VMArrayInstance_i16 || res instanceof VMArrayInstance_u16) { + res.set_elems(tc, str.length() * 2); /* Likely too much. */ + int elems = 0; + for (int i = 0; i < bufPos; i++) { + int cp = buffer[i]; + if (cp < 0x10000) { // 2-byte UTF-16 + tc.native_i = cp; + res.bind_pos_native(tc, elems++); + } + else { // 4-byte UTF-16 + /* Get a 20-bit value between 0x00000 and 0xFFFFF. 
*/ + cp -= 0x10000; + cp &= 0xFFFFF; + tc.native_i = 0xD800 + (cp >> 10); // 0xD800 + high ten bits + res.bind_pos_native(tc, elems++); + tc.native_i = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits + res.bind_pos_native(tc, elems++); + } + } + res.set_elems(tc, elems); /* Shorten VMArray as needed. */ + } + else if (res instanceof VMArrayInstance_i32 || res instanceof VMArrayInstance_u32) { + res.set_elems(tc, str.length()); /* Likely too much. */ + int elems = 0; + long cpTemp = 0; + byte elemsTemp = 0; + for (int i = 0; i < bufPos; i++) { + long cp = buffer[i]; + if (cp < 0x10000) { // 2-byte UTF-16 + if (elemsTemp == 0) { + cpTemp = cp; + elemsTemp = 1; + } + else { + tc.native_i = cpTemp + (cp << 16); + res.bind_pos_native(tc, elems++); + cpTemp = 0; + elemsTemp = 0; + } + } + else { // 4-byte UTF-16 + /* Get a 20-bit value between 0x00000 and 0xFFFFF. */ + cp -= 0x10000; + cp &= 0xFFFFF; + long highSurrogate = 0xD800 + (cp >> 10); // 0xD800 + high ten bits + long lowSurrogate = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits + if (elemsTemp == 0) { + tc.native_i = highSurrogate + (lowSurrogate << 16); + res.bind_pos_native(tc, elems++); + } + else { + tc.native_i = cpTemp + (highSurrogate << 16); + res.bind_pos_native(tc, elems++); + cpTemp = lowSurrogate; + } + } + } + if (elemsTemp != 0) { + tc.native_i = cpTemp; + res.bind_pos_native(tc, elems++); + } + res.set_elems(tc, elems); /* Shorten VMArray as needed. */ } else { - res.set_elems(tc, buffer.length); - for (int i = 0; i < buffer.length; i++) { - tc.native_i = buffer[i]; - res.bind_pos_native(tc, i); + res.set_elems(tc, (str.length() + 1) / 2); /* Likely too much. 
*/ + int elems = 0; + long cpTemp = 0; + byte elemsTemp = 0; + for (int i = 0; i < bufPos; i++) { + long cp = buffer[i]; + if (cp < 0x10000) { // 2-byte UTF-16 + if (elemsTemp == 0) { + cpTemp = cp; + elemsTemp = 1; + } + else { + if (elemsTemp == 3) { // buffer elem full + tc.native_i = cpTemp + (cp << 48); + res.bind_pos_native(tc, elems++); + cpTemp = 0; + elemsTemp = 0; + } + else { + cpTemp += cp << (elemsTemp++ * 16); + } + } + } + else { // 4-byte UTF-16 + /* Get a 20-bit value between 0x00000 and 0xFFFFF. */ + cp -= 0x10000; + cp &= 0xFFFFF; + long highSurrogate = 0xD800 + (cp >> 10); // 0xD800 + high ten bits + long lowSurrogate = 0xDC00 + (cp & 0x3FF); // 0xDC00 + low ten bits + if (elemsTemp == 0) { + cpTemp = highSurrogate + (lowSurrogate << 16); + elemsTemp = 2; + } + else { + if (elemsTemp == 3) { // buffer elem full + tc.native_i = cpTemp + (highSurrogate << 48); + res.bind_pos_native(tc, elems++); + cpTemp = lowSurrogate; + elemsTemp = 1; + } + else if (elemsTemp == 2) { + tc.native_i = cpTemp + (highSurrogate << 32) + (lowSurrogate << 48); + res.bind_pos_native(tc, elems++); + cpTemp = 0; + elemsTemp = 0; + } + else { // elemsTemp == 1 + cpTemp += (highSurrogate << 16) + (lowSurrogate << 32); + elemsTemp = 3; + } + } + } + } + if (elemsTemp != 0) { + tc.native_i = cpTemp; + res.bind_pos_native(tc, elems++); } + res.set_elems(tc, elems); /* Shorten VMArray as needed. 
*/ } } diff --git a/t/nqp/082-decode.t b/t/nqp/082-decode.t index a24907c7b..72090282e 100644 --- a/t/nqp/082-decode.t +++ b/t/nqp/082-decode.t @@ -1,6 +1,6 @@ use nqpmo; -plan(30); +plan(46); my sub create_buf($type) { my $buf := nqp::newtype(nqp::null(), 'VMArray'); @@ -10,6 +10,8 @@ my sub create_buf($type) { my $buf8 := create_buf(uint8); my $buf16 := create_buf(uint16); +my $buf32 := create_buf(uint32); +my $buf64 := create_buf(uint64); my $buf := nqp::encode('', 'utf8', nqp::create($buf8)); @@ -153,3 +155,72 @@ else { }, 'encode dies with missing character'); } } + +$buf := nqp::encode('abc', 'utf16', nqp::create($buf8)); +is(buf_dump($buf), '97,0,98,0,99,0', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint8'); +is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back'); + +$buf := nqp::encode('abc', 'utf16', nqp::create($buf16)); +is(buf_dump($buf), '97,98,99', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint16'); +is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back'); + +$buf := nqp::encode('abc', 'utf16', nqp::create($buf32)); +if nqp::getcomp('nqp').backend.name eq 'moar' { + todo('trailing character is dropped, https://github.com/MoarVM/MoarVM/issues/1606', 1); + ok(0); +} +else { + is(buf_dump($buf), ~(97 + 98 * 2**16) ~ ',99', 'nqp::encode 3-chars ascii string with utf16 using buffer of type uint32'); + ## TODO This returns "abc\0". Maybe that's just correct? 
+ #is(nqp::decode($buf, "utf16"), 'abc', 'nqp::decode gives original string back'); +} + +$buf := nqp::encode('abcd', 'utf16', nqp::create($buf64)); +is(buf_dump($buf), (97 + 98 * 2**16 + 99 * 2**32 + 100 * 2**48), 'nqp::encode 4-chars ascii string with utf16 using buffer of type uint64'); +if nqp::getcomp('nqp').backend.name eq 'jvm' { + todo('Unknown buf type in nqp::decode', 1); + ok(0); +} +else { + is(nqp::decode($buf, "utf16"), 'abcd', 'nqp::decode gives original string back'); +} + +## "\x1F63E" 'POUTING CAT FACE' is a code point from supplementary planes and requires two 16-bit code units (0xD83D,0xDE3E) +$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf8)); +is(buf_dump($buf), '61,216,62,222', 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint8'); +is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back'); + +$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf16)); +is(buf_dump($buf), '55357,56894', 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint16'); +is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back'); + +$buf := nqp::encode("\x1F63E", 'utf16', nqp::create($buf32)); +is(buf_dump($buf), ~(55357 + 56894 * 2**16), 'nqp::encode 1-char surrogate pair with utf16 using buffer of type uint32'); +if nqp::getcomp('nqp').backend.name eq 'jvm' { + todo('java.lang.IllegalArgumentException: Not a valid Unicode code point: 0xFFFFDE3E', 1); + ok(0); +} +else { + is(nqp::decode($buf, "utf16"), "\x1F63E", 'nqp::decode gives original string back'); +} + +$buf := nqp::encode('a' ~ "\x1F63E" ~ 'bcd', 'utf16', nqp::create($buf32)); +is(buf_dump($buf), (97 + 55357 * 2**16) ~ ',' ~ (56894 + 98 * 2**16) ~ ',' ~ (99 + 100 * 2**16), + 'nqp::encode mixed string with ascii and surrogate pair with utf16 using buffer of type uint32'); +if nqp::getcomp('nqp').backend.name eq 'jvm' { + todo('java.lang.IllegalArgumentException: Not a valid Unicode code point: 0xFFFFD83D', 
1); + ok(0); +} +else { + is(nqp::decode($buf, "utf16"), 'a' ~ "\x1F63E" ~ 'bcd', 'nqp::decode gives original string back'); +} + +$buf := nqp::encode('a' ~ "\x1F63E" ~ 'bcd', 'utf16', nqp::create($buf64)); +if nqp::getcomp('nqp').backend.name eq 'moar' { + todo('trailing character is dropped, https://github.com/MoarVM/MoarVM/issues/1606', 1); + ok(0); +} +else { + is(buf_dump($buf), (97 + 55357 * 2**16 + 56894 * 2**32 + 98 * 2**48) ~ ',' ~ (99 + 100 * 2**16), + 'nqp::encode mixed string with ascii and surrogate pair with utf16 using buffer of type uint64'); +}