From af45a5bb9933fb12a909301c4b192bb293064ee7 Mon Sep 17 00:00:00 2001
From: Frank Tang <ftang@chromium.org>
Date: Fri, 13 Sep 2024 17:03:50 -0700
Subject: [PATCH] ICU-22890 MF2: Add ICU4C test for lone surrogates and fix
 ICU4J surrogate handling

This PR adds a test to ICU4C for handling of lone surrogates,
and also changes ICU4J so that this case is a syntax error (per the spec).

Incidentally fix uninitialized-memory bug in MessageFormatter
(initialize `errors` to nullptr)

Co-authored-by: Tim Chevalier <tjc@igalia.com>
---
 icu4c/source/i18n/unicode/messageformat2.h    |  2 +-
 .../test/intltest/messageformat2test.cpp      | 30 +++++++++++++++++++
 .../source/test/intltest/messageformat2test.h |  2 ++
 .../com/ibm/icu/message2/InputSource.java     |  7 +++++
 .../java/com/ibm/icu/message2/MFParser.java   | 10 ++++++-
 .../dev/test/message2/MessageFormat2Test.java | 30 +++++++++++++++++++
 6 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/icu4c/source/i18n/unicode/messageformat2.h b/icu4c/source/i18n/unicode/messageformat2.h
index d23ac8409d81..e77c61d6d36e 100644
--- a/icu4c/source/i18n/unicode/messageformat2.h
+++ b/icu4c/source/i18n/unicode/messageformat2.h
@@ -378,7 +378,7 @@ namespace message2 {
         // Must be a raw pointer to avoid including the internal header file
         // defining StaticErrors
         // Owned by `this`
-        StaticErrors* errors;
+        StaticErrors* errors = nullptr;
 
     }; // class MessageFormatter
 
diff --git a/icu4c/source/test/intltest/messageformat2test.cpp b/icu4c/source/test/intltest/messageformat2test.cpp
index 524ee4ab79d9..5b0c04697b33 100644
--- a/icu4c/source/test/intltest/messageformat2test.cpp
+++ b/icu4c/source/test/intltest/messageformat2test.cpp
@@ -30,6 +30,8 @@ TestMessageFormat2::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(testAPI);
     TESTCASE_AUTO(testAPISimple);
     TESTCASE_AUTO(testDataModelAPI);
+    TESTCASE_AUTO(testHighLoneSurrogate);
+    TESTCASE_AUTO(testLowLoneSurrogate);
     TESTCASE_AUTO(dataDrivenTests);
     TESTCASE_AUTO_END;
 }
@@ -263,6 +265,34 @@ void TestMessageFormat2::testAPICustomFunctions() {
     delete person;
 }
 
+// ICU-22890 lone surrogate cause infinity loop
+void TestMessageFormat2::testHighLoneSurrogate() {
+    IcuTestErrorCode errorCode(*this, "testHighLoneSurrogate");
+    UParseError pe = { 0, 0, {0}, {0} };
+    // Lone surrogate with only high surrogate
+    UnicodeString loneSurrogate({0xda02, 0});
+    icu::message2::MessageFormatter msgfmt1 =
+      icu::message2::MessageFormatter::Builder(errorCode)
+      .setPattern(loneSurrogate, pe, errorCode)
+      .build(errorCode);
+    UnicodeString result = msgfmt1.formatToString({}, errorCode);
+    errorCode.expectErrorAndReset(U_MF_SYNTAX_ERROR, "testHighLoneSurrogate");
+}
+
+// ICU-22890 lone surrogate cause infinity loop
+void TestMessageFormat2::testLowLoneSurrogate() {
+    IcuTestErrorCode errorCode(*this, "testLowLoneSurrogate");
+    UParseError pe = { 0, 0, {0}, {0} };
+    // Lone surrogate with only low surrogate
+    UnicodeString loneSurrogate({0xdc02, 0});
+    icu::message2::MessageFormatter msgfmt2 =
+      icu::message2::MessageFormatter::Builder(errorCode)
+      .setPattern(loneSurrogate, pe, errorCode)
+      .build(errorCode);
+    UnicodeString result = msgfmt2.formatToString({}, errorCode);
+    errorCode.expectErrorAndReset(U_MF_SYNTAX_ERROR, "testLowLoneSurrogate");
+}
+
 void TestMessageFormat2::dataDrivenTests() {
     IcuTestErrorCode errorCode(*this, "jsonTests");
 
diff --git a/icu4c/source/test/intltest/messageformat2test.h b/icu4c/source/test/intltest/messageformat2test.h
index 7c7c2612f50d..b4b250cb5106 100644
--- a/icu4c/source/test/intltest/messageformat2test.h
+++ b/icu4c/source/test/intltest/messageformat2test.h
@@ -87,6 +87,8 @@ class TestMessageFormat2: public IntlTest {
     void testMessageFormatDateTimeSkeleton(message2::TestCase::Builder&, IcuTestErrorCode&);
     void testMf1Behavior(message2::TestCase::Builder&, IcuTestErrorCode&);
 
+    void testHighLoneSurrogate(void);
+    void testLowLoneSurrogate(void);
 }; // class TestMessageFormat2
 
 U_NAMESPACE_BEGIN
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/message2/InputSource.java b/icu4j/main/core/src/main/java/com/ibm/icu/message2/InputSource.java
index 8c89c93bf51a..167ace313cde 100644
--- a/icu4j/main/core/src/main/java/com/ibm/icu/message2/InputSource.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/message2/InputSource.java
@@ -29,6 +29,13 @@ int peekChar() {
         return buffer.charAt(cursor);
     }
 
+    // Used when checking for unpaired surrogates
+    int readChar() {
+        int ch = peekChar();
+        cursor++;
+        return (char) ch;
+    }
+
     int readCodePoint() {
         // TODO: remove this?
         // START Detect possible infinite loop
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/message2/MFParser.java b/icu4j/main/core/src/main/java/com/ibm/icu/message2/MFParser.java
index a5c826f87397..791b45e864ee 100644
--- a/icu4j/main/core/src/main/java/com/ibm/icu/message2/MFParser.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/message2/MFParser.java
@@ -113,9 +113,17 @@ private MFDataModel.PatternPart getPatternPart() throws MFParseException {
         }
     }
 
-    private String getText() {
+    private String getText() throws MFParseException {
         StringBuilder result = new StringBuilder();
         while (true) {
+            int ch = input.readChar();
+            // Check for unmatched surrogates
+            checkCondition(!(Character.isHighSurrogate((char) ch)
+                             && (input.atEnd()
+                                 || !Character.isLowSurrogate((char) input.peekChar()))),
+                           "Unpaired high surrogate");
+            checkCondition (!Character.isLowSurrogate((char) ch), "Unpaired low surrogate");
+            input.backup(1);
             int cp = input.readCodePoint();
             switch (cp) {
                 case EOF:
diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/message2/MessageFormat2Test.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/message2/MessageFormat2Test.java
index 6ac37d6085fd..fece39bfa603 100644
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/message2/MessageFormat2Test.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/message2/MessageFormat2Test.java
@@ -46,6 +46,36 @@ public void test() {
                 mf2.formatToString(Args.NONE));
     }
 
+    @Test
+    public void testHighLoneSurrogate() {
+        try {
+            MessageFormatter mf2 = MessageFormatter.builder()
+                .setPattern("\uda02").build();
+            assertEquals("testHighLoneSurrogate: expected to throw, but didn't",
+                         false, true);
+        } catch (IllegalArgumentException e) {
+            // Parse error was thrown, as expected
+        } catch (Exception e) {
+            assertEquals("testHighLoneSurrogate: expected IllegalArgumentException "
+                         + "but threw " + e.toString(), false, true);
+        }
+    }
+
+    @Test
+    public void testLowLoneSurrogate() {
+        try {
+            MessageFormatter mf2 = MessageFormatter.builder()
+                .setPattern("\udc02").build();
+            assertEquals("testLowLoneSurrogate: expected to throw, but didn't",
+                         false, true);
+        } catch (IllegalArgumentException e) {
+            // Parse error was thrown, as expected
+        } catch (Exception e) {
+            assertEquals("testLowLoneSurrogate: expected IllegalArgumentException "
+                         + "but threw " + e.toString(), false, true);
+        }
+    }
+
     @Test
     public void testDateFormat() {
         Date expiration = new Date(2022 - 1900, java.util.Calendar.OCTOBER, 27);