From 6568b04c7017473667dfb8d8b5de7bf07499d073 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Mon, 12 Aug 2024 19:17:23 -0700
Subject: [PATCH] U16Iterator op*() returns U16OneSeq

---
 icu4c/source/common/common.vcxproj.filters    |   3 +
 icu4c/source/common/unicode/utf16cppiter.h    | 136 +++++++++++-------
 icu4c/source/test/intltest/intltest.vcxproj   |   1 +
 .../test/intltest/intltest.vcxproj.filters    |   3 +
 icu4c/source/test/intltest/utfcppitertest.cpp |  16 ++-
 5 files changed, 100 insertions(+), 59 deletions(-)
diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters
index 1faff8765d33..72f02de9cc3b 100644
--- a/icu4c/source/common/common.vcxproj.filters
+++ b/icu4c/source/common/common.vcxproj.filters
@@ -1258,6 +1258,9 @@
     <CustomBuild Include="unicode\utf16.h">
       <Filter>strings</Filter>
     </CustomBuild>
+    <CustomBuild Include="unicode\utf16cppiter.h">
+      <Filter>strings</Filter>
+    </CustomBuild>
     <CustomBuild Include="unicode\utf32.h">
       <Filter>strings</Filter>
     </CustomBuild>
diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h
index 5fb0b87dae06..582ce1d1b6de 100644
--- a/icu4c/source/common/unicode/utf16cppiter.h
+++ b/icu4c/source/common/unicode/utf16cppiter.h
@@ -39,92 +39,122 @@ enum U16IllFormedBehavior {
     U16_BEHAVIOR_SURROGATE
 };
 
-// Validating iterator over the code points in a Unicode 16-bit string.
-// TODO: all @draft ICU 76
+/**
+ * A code unit sequence for one code point returned by U16Iterator.
+ *
+ * Only U16Iterator creates new instances; callers may copy the one returned by its operator*().
+ * @tparam Unit16 char16_t or uint16_t or (on Windows) wchar_t
+ * @draft ICU 76
+ */
+template<typename Unit16>
+class U16OneSeq {
+public:
+    U16OneSeq(const U16OneSeq &other) = default;
+
+    const Unit16 *data() const { return p; }  // pointer to the first code unit of the sequence
+    int32_t length() const { return len; }  // number of code units: 1 or 2, or 0 at the limit
+
+    std::basic_string_view<Unit16> stringView() const {
+        return std::basic_string_view<Unit16>(p, len);
+    }
+
+    bool isWellFormed() const { return ok; }  // false for an unpaired surrogate
+
+    UChar32 codePoint() const { return c; }  // U+FFFD for an unpaired surrogate
+
+    // TODO: std::optional<UChar32> maybeCodePoint() const ? (nullopt if !ok)
+
+private:
+    // The friend template needs its own parameter name: reusing Unit16 here is an
+    // error (declaration of 'Unit16' shadows template parameter).
+    template<typename SomeOtherUnit16, U16IllFormedBehavior behavior>
+    friend class U16Iterator;
+
+    explicit U16OneSeq(const Unit16 *p) : p(p) {}
+
+    void fwd1() { p += len; }  // skip the sequence that readOneForward() measured
+
+    void readOneForward(const Unit16 *limit) {
+        if (p == limit) {
+            len = 0;  // nothing to read; c and ok keep their previous values
+            return;
+        }
+        // see U16_NEXT_OR_FFFD()
+        c = *p;
+        len = 1;
+        ok = true;
+        if (U16_IS_SURROGATE(c)) {
+            uint16_t c2;
+            if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) {
+                c = U16_GET_SUPPLEMENTARY(c, c2);
+                len = 2;
+            } else {
+                // TODO: U16IllFormedBehavior
+                c = 0xfffd;
+                ok = false;
+            }
+        }
+    }
+
+    const Unit16 *p;
+    UChar32 c = 0;
+    int8_t len = 0;
+    bool ok = false;
+};
+
+/**
+ * Validating iterator over the code points in a Unicode 16-bit string.
+ *
+ * operator*() returns a U16OneSeq describing the code point at the current position.
+ * @tparam Unit16 char16_t or uint16_t or (on Windows) wchar_t
+ * @tparam behavior how ill-formed input is reported (TODO: not yet used when reading)
+ * @draft ICU 76
+ */
 template<typename Unit16, U16IllFormedBehavior behavior>
 class U16Iterator {
 public:
     // TODO: make private, make friends
     U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) :
-            start(start), p(p), limit(limit) {
-        if (p != limit) {
-            readOneForward();
-        }
+            start(start), limit(limit), seq(p) {
+        seq.readOneForward(limit);  // sets seq.len = 0 without reading when p == limit
     }
     // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0.
     // Test pointers for == or != but not < or >.
 
     U16Iterator(const U16Iterator &other) = default;
-    U16Iterator(U16Iterator &&other) noexcept = default;
 
-    bool operator==(const U16Iterator &other) const { return p == other.p; }
+    bool operator==(const U16Iterator &other) const { return seq.p == other.seq.p; }
     bool operator!=(const U16Iterator &other) const { return !operator==(other); }
 
-    UChar32 operator*() const {
-        return c;
-    }
-
-    // TODO: good function names?
-    // It would be nice to avoid a prefix like "current", "one", "cp",
-    // but just length() on the iterator could be confusing.
-    int32_t currentLength() const { return len; }
-
-    std::basic_string_view<Unit16> currentView() const {
-        return std::basic_string_view<Unit16>(p, len);
+    const U16OneSeq<Unit16> &operator*() const {  // reference to this iterator's own member
+        return seq;  // dangles once *this is destroyed, e.g. the temporary in *iter++
     }
 
-    bool currentIsWellFormed() const { return ok; }
-
     U16Iterator &operator++() {  // pre-increment
         // TODO: think about switching directions etc.
-        // Assume that readOneForward() was called and set `len`.
+        // Assume that readOneForward() was called and set seq.len.
         // Skip the current code point, then read the next one.
-        p += len;
-        if (p != limit) {
-            readOneForward();
-        }
+        seq.fwd1();  // seq.p += seq.len
+        seq.readOneForward(limit);
         return *this;
     }
 
     U16Iterator operator++(int) {  // post-increment
         U16Iterator result(*this);
         // TODO: think about switching directions etc.
-        // Assume that readOneForward() was called and set `len`.
+        // Assume that readOneForward() was called and set seq.len.
         // Skip the current code point, then read the next one.
-        p += len;
-        if (p != limit) {
-            readOneForward();
-        }
+        seq.fwd1();  // seq.p += seq.len
+        seq.readOneForward(limit);
         return result;
     }
 
 private:
-    void readOneForward() {
-        // see U16_NEXT_OR_FFFD()
-        c = *p;
-        len = 1;
-        ok = true;
-        if (U16_IS_SURROGATE(c)) {
-            uint16_t c2;
-            if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) {
-                c = U16_GET_SUPPLEMENTARY(c, c2);
-                len = 2;
-            } else {
-                // TODO: U16IllFormedBehavior
-                c = 0xfffd;
-                ok = false;
-            }
-        }
-    }
-
     // In a validating iterator, we need start & limit so that when we read a code point
     // (forward or backward) we can test if there are enough code units.
     const Unit16 *start;
-    const Unit16 *p;
     const Unit16 *limit;
-    UChar32 c = 0;
-    int8_t len = 0;
-    bool ok = false;
+    U16OneSeq<Unit16> seq;  // current position plus the code point decoded there
 };
 
 // ------------------------------------------------------------------------- ***
diff --git a/icu4c/source/test/intltest/intltest.vcxproj b/icu4c/source/test/intltest/intltest.vcxproj
index b58b29b3d4e7..8d9bba021508 100644
--- a/icu4c/source/test/intltest/intltest.vcxproj
+++ b/icu4c/source/test/intltest/intltest.vcxproj
@@ -223,6 +223,7 @@
     <ClCompile Include="sfwdchit.cpp" />
     <ClCompile Include="strcase.cpp" />
     <ClCompile Include="ustrtest.cpp" />
+    <ClCompile Include="utfcppitertest.cpp" />
     <ClCompile Include="utxttest.cpp" />
     <ClCompile Include="cpdtrtst.cpp" />
     <ClCompile Include="ittrans.cpp" />
diff --git a/icu4c/source/test/intltest/intltest.vcxproj.filters b/icu4c/source/test/intltest/intltest.vcxproj.filters
index d5c23d5e4cb5..0abc4608d1a6 100644
--- a/icu4c/source/test/intltest/intltest.vcxproj.filters
+++ b/icu4c/source/test/intltest/intltest.vcxproj.filters
@@ -490,6 +490,9 @@
     <ClCompile Include="ustrtest.cpp">
       <Filter>strings</Filter>
     </ClCompile>
+    <ClCompile Include="utfcppitertest.cpp">
+      <Filter>strings</Filter>
+    </ClCompile>
     <ClCompile Include="utxttest.cpp">
       <Filter>strings</Filter>
     </ClCompile>
diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp
index c0a914b579c1..f71f23327386 100644
--- a/icu4c/source/test/intltest/utfcppitertest.cpp
+++ b/icu4c/source/test/intltest/utfcppitertest.cpp
@@ -14,8 +14,9 @@
 // https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv
 using namespace std::string_view_literals;
 
-using U_HEADER_ONLY_NAMESPACE::U16Iterator;
 using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_NEGATIVE;
+using U_HEADER_ONLY_NAMESPACE::U16Iterator;
+using U_HEADER_ONLY_NAMESPACE::U16OneSeq;
 
 class U16IteratorTest : public IntlTest {
 public:
@@ -44,14 +45,17 @@ void U16IteratorTest::testExperiment() {
     std::u16string_view good(u"abçカ🚴"sv);
     const char16_t *goodLimit = good.data() + good.length();
     U16Iterator<char16_t, U16_BEHAVIOR_NEGATIVE> goodIter(good.data(), good.data(), goodLimit);
-    assertEquals("goodIter[0] *", u'a', *goodIter);
+    assertEquals("goodIter[0] * codePoint()", u'a', (*goodIter).codePoint());
     ++goodIter;  // pre-increment
-    assertEquals("goodIter[1] *", u'b', *goodIter);
+    assertEquals("goodIter[1] * codePoint()", u'b', (*goodIter).codePoint());
     ++goodIter;
-    assertEquals("goodIter[2] *", u'ç', *goodIter++);  // post-increment
-    assertEquals("goodIter[3] *", u'カ', *goodIter);
+    assertEquals("goodIter[2] * codePoint()", u'ç', (*goodIter++).codePoint());  // post-increment
+    assertEquals("goodIter[3] * codePoint()", u'カ', (*goodIter).codePoint());
     ++goodIter;
-    assertEquals("goodIter[4] *", U'🚴', *goodIter++);
+    U16OneSeq<char16_t> seq = *goodIter++;  // copy: a reference would point into the temporary
+    assertEquals("goodIter[4] * codePoint()", U'🚴', seq.codePoint());
+    assertEquals("goodIter[4] * length()", 2, seq.length());
+    assertTrue("goodIter[4] * stringView()", seq.stringView() == u"🚴"sv);
     U16Iterator<char16_t, U16_BEHAVIOR_NEGATIVE> goodEndIter(good.data(), goodLimit, goodLimit);
     assertTrue("goodIter == goodEndIter", goodIter == goodEndIter);