From 6568b04c7017473667dfb8d8b5de7bf07499d073 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 12 Aug 2024 19:17:23 -0700 Subject: [PATCH] U16Iterator op*() returns U16OneSeq --- icu4c/source/common/common.vcxproj.filters | 3 + icu4c/source/common/unicode/utf16cppiter.h | 136 +++++++++++------- icu4c/source/test/intltest/intltest.vcxproj | 1 + .../test/intltest/intltest.vcxproj.filters | 3 + icu4c/source/test/intltest/utfcppitertest.cpp | 16 ++- 5 files changed, 100 insertions(+), 59 deletions(-) diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index 1faff8765d33..72f02de9cc3b 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -1258,6 +1258,9 @@ strings + + strings + strings diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h index 5fb0b87dae06..582ce1d1b6de 100644 --- a/icu4c/source/common/unicode/utf16cppiter.h +++ b/icu4c/source/common/unicode/utf16cppiter.h @@ -39,92 +39,122 @@ enum U16IllFormedBehavior { U16_BEHAVIOR_SURROGATE }; -// Validating iterator over the code points in a Unicode 16-bit string. -// TODO: all @draft ICU 76 +/** + * A code unit sequence for one code point returned by U16Iterator. + * + * TODO: check doxygen syntax for template parameters + * @param Unit16 char16_t or uint16_t or (on Windows) wchar_t + * @draft ICU 76 + */ +template +class U16OneSeq { +public: + U16OneSeq(const U16OneSeq &other) = default; + + const Unit16 *data() { return p; } + int32_t length() const { return len; } + + std::basic_string_view stringView() const { + return std::basic_string_view(p, len); + } + + bool isWellFormed() const { return ok; } + + UChar32 codePoint() const { return c; } + + // TODO: std::optional maybeCodePoint() const ? (nullopt if !ok) + +private: + // TODO: Why can't we just use Unit16 here? + // error: declaration of 'Unit16' shadows template parameter + template + friend class U16Iterator; + + U16OneSeq(const Unit16 *p) : p(p) {} + + void fwd1() { p += len; } + + void readOneForward(const Unit16 *limit) { + if (p == limit) { + len = 0; + return; + } + // see U16_NEXT_OR_FFFD() + c = *p; + len = 1; + ok = true; + if (U16_IS_SURROGATE(c)) { + uint16_t c2; + if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) { + c = U16_GET_SUPPLEMENTARY(c, c2); + len = 2; + } else { + // TODO: U16IllFormedBehavior + c = 0xfffd; + ok = false; + } + } + } + + const Unit16 *p; + UChar32 c = 0; + int8_t len = 0; + bool ok = false; +}; + +/** + * Validating iterator over the code points in a Unicode 16-bit string. + * + * TODO: check doxygen syntax for template parameters + * @param Unit16 char16_t or uint16_t or (on Windows) wchar_t + * @param U16IllFormedBehavior TODO + * @draft ICU 76 + */ template class U16Iterator { public: // TODO: make private, make friends U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) : - start(start), p(p), limit(limit) { - if (p != limit) { - readOneForward(); - } + start(start), limit(limit), seq(p) { + seq.readOneForward(limit); } // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0. // Test pointers for == or != but not < or >. U16Iterator(const U16Iterator &other) = default; - U16Iterator(U16Iterator &&other) noexcept = default; - bool operator==(const U16Iterator &other) const { return p == other.p; } + bool operator==(const U16Iterator &other) const { return seq.p == other.seq.p; } bool operator!=(const U16Iterator &other) const { return !operator==(other); } - UChar32 operator*() const { - return c; - } - - // TODO: good function names? - // It would be nice to avoid a prefix like "current", "one", "cp", - // but just length() on the iterator could be confusing. - int32_t currentLength() const { return len; } - - std::basic_string_view currentView() const { - return std::basic_string_view(p, len); + const U16OneSeq &operator*() const { + return seq; } - bool currentIsWellFormed() const { return ok; } - U16Iterator &operator++() { // pre-increment // TODO: think about switching directions etc. - // Assume that readOneForward() was called and set `len`. + // Assume that readOneForward() was called and set seq.len. // Skip the current code point, then read the next one. - p += len; - if (p != limit) { - readOneForward(); - } + seq.fwd1(); + seq.readOneForward(limit); return *this; } U16Iterator operator++(int) { // post-increment U16Iterator result(*this); // TODO: think about switching directions etc. - // Assume that readOneForward() was called and set `len`. + // Assume that readOneForward() was called and set seq.len. // Skip the current code point, then read the next one. - p += len; - if (p != limit) { - readOneForward(); - } + seq.fwd1(); + seq.readOneForward(limit); return result; } private: - void readOneForward() { - // see U16_NEXT_OR_FFFD() - c = *p; - len = 1; - ok = true; - if (U16_IS_SURROGATE(c)) { - uint16_t c2; - if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) { - c = U16_GET_SUPPLEMENTARY(c, c2); - len = 2; - } else { - // TODO: U16IllFormedBehavior - c = 0xfffd; - ok = false; - } - } - } - // In a validating iterator, we need start & limit so that when we read a code point // (forward or backward) we can test if there are enough code units. const Unit16 *start; - const Unit16 *p; const Unit16 *limit; - UChar32 c = 0; - int8_t len = 0; - bool ok = false; + U16OneSeq seq; }; // ------------------------------------------------------------------------- *** diff --git a/icu4c/source/test/intltest/intltest.vcxproj b/icu4c/source/test/intltest/intltest.vcxproj index b58b29b3d4e7..8d9bba021508 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj +++ b/icu4c/source/test/intltest/intltest.vcxproj @@ -223,6 +223,7 @@ + diff --git a/icu4c/source/test/intltest/intltest.vcxproj.filters b/icu4c/source/test/intltest/intltest.vcxproj.filters index d5c23d5e4cb5..0abc4608d1a6 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj.filters +++ b/icu4c/source/test/intltest/intltest.vcxproj.filters @@ -490,6 +490,9 @@ strings + + strings + strings diff --git a/icu4c/source/test/intltest/utfcppitertest.cpp b/icu4c/source/test/intltest/utfcppitertest.cpp index c0a914b579c1..f71f23327386 100644 --- a/icu4c/source/test/intltest/utfcppitertest.cpp +++ b/icu4c/source/test/intltest/utfcppitertest.cpp @@ -14,8 +14,9 @@ // https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv using namespace std::string_view_literals; -using U_HEADER_ONLY_NAMESPACE::U16Iterator; using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_NEGATIVE; +using U_HEADER_ONLY_NAMESPACE::U16Iterator; +using U_HEADER_ONLY_NAMESPACE::U16OneSeq; class U16IteratorTest : public IntlTest { public: @@ -44,14 +45,17 @@ void U16IteratorTest::testExperiment() { std::u16string_view good(u"abçカ🚴"sv); const char16_t *goodLimit = good.data() + good.length(); U16Iterator goodIter(good.data(), good.data(), goodLimit); - assertEquals("goodIter[0] *", u'a', *goodIter); + assertEquals("goodIter[0] * codePoint()", u'a', (*goodIter).codePoint()); ++goodIter; // pre-increment - assertEquals("goodIter[1] *", u'b', *goodIter); + assertEquals("goodIter[1] * codePoint()", u'b', (*goodIter).codePoint()); ++goodIter; - assertEquals("goodIter[2] *", u'ç', *goodIter++); // post-increment - assertEquals("goodIter[3] *", u'カ', *goodIter); + assertEquals("goodIter[2] * codePoint()", u'ç', (*goodIter++).codePoint()); // post-increment + assertEquals("goodIter[3] * codePoint()", u'カ', (*goodIter).codePoint()); ++goodIter; - assertEquals("goodIter[4] *", U'🚴', *goodIter++); + const U16OneSeq &seq = *goodIter++; + assertEquals("goodIter[4] * codePoint()", U'🚴', seq.codePoint()); + assertEquals("goodIter[4] * length()", 2, seq.length()); + assertTrue("goodIter[4] * stringView()", seq.stringView() == u"🚴"sv); U16Iterator goodEndIter(good.data(), goodLimit, goodLimit); assertTrue("goodIter == goodEndIter", goodIter == goodEndIter);