From 671f8eddd8ee1332dc0f64a07d74c27787da9ca0 Mon Sep 17 00:00:00 2001 From: stackunderflow111 <91698262+stackunderflow111@users.noreply.github.com> Date: Thu, 6 Feb 2025 08:12:00 +0900 Subject: [PATCH 1/3] fix the surrogate utf8 feature when custom characterEscapes is used (#1399) Co-authored-by: stack_underflow --- release-notes/CREDITS-2.x | 6 ++++++ release-notes/VERSION-2.x | 4 ++++ .../jackson/core/json/UTF8JsonGenerator.java | 20 +++++++++++++++++++ .../core/write/SurrogateWrite223Test.java | 17 ++++++++++++++++ 4 files changed, 47 insertions(+) diff --git a/release-notes/CREDITS-2.x b/release-notes/CREDITS-2.x index 002eedbd09..899056486c 100644 --- a/release-notes/CREDITS-2.x +++ b/release-notes/CREDITS-2.x @@ -461,3 +461,9 @@ Justin Gosselin (@jgosselin-accesso) * Reported #1359: Non-surrogate characters being incorrectly combined when `JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8` is enabled (2.18.2) + +Haruki (@stackunderflow111) + * Reported #1398: feature COMBINE_UNICODE_SURROGATES_IN_UTF8 doesn't work + when custom characterEscape is used + (2.18.2) + diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x index b984031a42..41a7a7c6c7 100644 --- a/release-notes/VERSION-2.x +++ b/release-notes/VERSION-2.x @@ -24,6 +24,10 @@ a pure JSON library. (reported by @Rodenstock) (fix contributed by @pjfanning) +#1398: Fix issue that feature COMBINE_UNICODE_SURROGATES_IN_UTF8 doesn't work + when custom characterEscape is used + (reported and fixed by @stackunderflow111) + 2.18.2 (27-Nov-2024) #1359: Non-surrogate characters being incorrectly combined when diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java index 33b216769f..b96e9003d0 100644 --- a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java +++ b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java @@ -1732,6 +1732,16 @@ private final void _writeCustomStringSegment2(final char[] cbuf, int offset, fin outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6)); outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f)); } else { + // 3- or 4-byte character + if (_isStartOfSurrogatePair(ch)) { + final boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_features); + if (combineSurrogates && offset < end) { + char highSurrogate = (char) ch; + char lowSurrogate = cbuf[offset++]; + outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr); + continue; + } + } outputPtr = _outputMultiByteChar(ch, outputPtr); } } @@ -1789,6 +1799,16 @@ private final void _writeCustomStringSegment2(final String text, int offset, fin outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6)); outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f)); } else { + // 3- or 4-byte character + if (_isStartOfSurrogatePair(ch)) { + final boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_features); + if (combineSurrogates && offset < end) { + char highSurrogate = (char) ch; + char lowSurrogate = text.charAt(offset++); + outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr); + continue; + } + } outputPtr = _outputMultiByteChar(ch, outputPtr); } } diff --git a/src/test/java/com/fasterxml/jackson/core/write/SurrogateWrite223Test.java b/src/test/java/com/fasterxml/jackson/core/write/SurrogateWrite223Test.java index aa5b57e4a2..a244c0d140 100644 --- a/src/test/java/com/fasterxml/jackson/core/write/SurrogateWrite223Test.java +++ b/src/test/java/com/fasterxml/jackson/core/write/SurrogateWrite223Test.java @@ -123,4 +123,21 @@ void checkNonSurrogates() throws Exception { assertTrue(json.contains("foo\u3042bar")); assertTrue(json.contains("\"test_emoji\":\"\uD83D\uDE0A\"")); } + + @Test + void checkSurrogateWithCharacterEscapes() throws Exception { + JsonFactory f = JsonFactory.builder() + .enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8) + .build(); + f.setCharacterEscapes(JsonpCharacterEscapes.instance()); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (JsonGenerator gen = f.createGenerator(out)) { + gen.writeStartObject(); + // Outside the BMP; 0x1F60A - emoji + gen.writeStringField("test_emoji", new String(Character.toChars(0x1F60A))); + gen.writeEndObject(); + } + String json = out.toString("UTF-8"); + assertEquals("{\"test_emoji\":\"\uD83D\uDE0A\"}", json); + } } From 583a8c180f187852ca12e63ff903174b405a50eb Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Wed, 5 Feb 2025 15:13:52 -0800 Subject: [PATCH 2/3] ... --- release-notes/VERSION-2.x | 1 - 1 file changed, 1 deletion(-) diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x index 41a7a7c6c7..bb1cdba66a 100644 --- a/release-notes/VERSION-2.x +++ b/release-notes/VERSION-2.x @@ -23,7 +23,6 @@ a pure JSON library. JSON structures and existing infinite values (reported by @Rodenstock) (fix contributed by @pjfanning) - #1398: Fix issue that feature COMBINE_UNICODE_SURROGATES_IN_UTF8 doesn't work when custom characterEscape is used (reported and fixed by @stackunderflow111) From 31c74aca5c6e7928e34b8dfe536bd9cdff1e16d4 Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Wed, 5 Feb 2025 15:14:58 -0800 Subject: [PATCH 3/3] Fix credits wrt version number --- release-notes/CREDITS-2.x | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release-notes/CREDITS-2.x b/release-notes/CREDITS-2.x index 899056486c..6f64e82493 100644 --- a/release-notes/CREDITS-2.x +++ b/release-notes/CREDITS-2.x @@ -465,5 +465,5 @@ Justin Gosselin (@jgosselin-accesso) Haruki (@stackunderflow111) * Reported #1398: feature COMBINE_UNICODE_SURROGATES_IN_UTF8 doesn't work when custom characterEscape is used - (2.18.2) + (2.18.3)