diff --git a/icu4c/source/common/unicode/utf16cppiter.h b/icu4c/source/common/unicode/utf16cppiter.h
index 529bc844d70d..777cea6174e4 100644
--- a/icu4c/source/common/unicode/utf16cppiter.h
+++ b/icu4c/source/common/unicode/utf16cppiter.h
@@ -58,7 +58,8 @@ typedef enum UIllFormedBehavior {
 namespace U_HEADER_ONLY_NAMESPACE {
 
 /**
- * Result of decoding a minimal Unicode code unit sequence.
+ * Result of validating and decoding a minimal Unicode code unit sequence.
+ * Returned from validating Unicode string code point iterators.
  *
  * @tparam Unit Code unit type:
  *     UTF-8: char or char8_t or uint8_t;
@@ -99,6 +100,46 @@ class CodeUnits {
     const Unit *p;
 };
 
+/**
+ * Result of decoding a minimal Unicode code unit sequence which must be well-formed.
+ * Returned from non-validating Unicode string code point iterators.
+ *
+ * @tparam Unit Code unit type:
+ *     UTF-8: char or char8_t or uint8_t;
+ *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t
+ * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
+ *              should be signed if U_BEHAVIOR_NEGATIVE
+ * @draft ICU 77
+ */
+template<typename Unit, typename CP32>
+class UnsafeCodeUnits {
+public:
+    // @internal
+    UnsafeCodeUnits(CP32 codePoint, uint8_t length, const Unit *data) :
+            c(codePoint), len(length), p(data) {}
+
+    UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
+    UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
+
+    UChar32 codePoint() const { return c; }  // the code point passed to the constructor
+
+    const Unit *data() const { return p; }  // pointer to the code units; not owned by this object
+
+    int32_t length() const { return len; }  // number of code units at data()
+
+    std::basic_string_view<Unit> stringView() const {  // non-owning view of data()[0..length())
+        return std::basic_string_view<Unit>(p, len);
+    }
+
+    // TODO: std::optional<CP32> maybeCodePoint() const ? (nullopt if ill-formed)
+
+private:
+    // Order of fields with padding and access frequency in mind.
+    CP32 c;
+    uint8_t len;
+    const Unit *p;
+};
+
 /**
  * Internal base class for public U16Iterator & U16ReverseIterator.
  * Not intended for public subclassing.
@@ -118,6 +159,11 @@ class U16IteratorBase {
     // TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0.
     // Test pointers for == or != but not < or >.
 
+    // @internal
+    U16IteratorBase(const U16IteratorBase &other) = default;
+    // @internal
+    U16IteratorBase &operator=(const U16IteratorBase &other) = default;
+
     // @internal
     bool operator==(const U16IteratorBase &other) const { return current == other.current; }
     // @internal
@@ -201,6 +247,7 @@ class U16Iterator : private U16IteratorBase {
             Super(start, p, limit) {}
 
     U16Iterator(const U16Iterator &other) = default;
+    U16Iterator &operator=(const U16Iterator &other) = default;
 
     bool operator==(const U16Iterator &other) const { return Super::operator==(other); }
     bool operator!=(const U16Iterator &other) const { return !Super::operator==(other); }
@@ -257,6 +304,7 @@ class U16ReverseIterator : private U16IteratorBase {
             Super(start, p, limit) {}
 
     U16ReverseIterator(const U16ReverseIterator &other) = default;
+    U16ReverseIterator &operator=(const U16ReverseIterator &other) = default;
 
     bool operator==(const U16ReverseIterator &other) const { return Super::operator==(other); }
     bool operator!=(const U16ReverseIterator &other) const { return !Super::operator==(other); }
@@ -285,7 +333,7 @@ class U16ReverseIterator : private U16IteratorBase {
 };
 
 /**
- * A C++ "range" for iterating over all of the code points of a 16-bit Unicode string.
+ * A C++ "range" for validating iteration over all of the code points of a 16-bit Unicode string.
  *
  * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
  * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
@@ -305,6 +353,9 @@ class U16StringCodePoints {
     /** @draft ICU 77 */
     U16StringCodePoints(const U16StringCodePoints &other) = default;
 
+    /** @draft ICU 77 */
+    U16StringCodePoints &operator=(const U16StringCodePoints &other) = default;
+
     /** @draft ICU 77 */
     U16Iterator begin() const {
         return {s.data(), s.data(), s.data() + s.length()};
@@ -333,11 +384,215 @@ class U16StringCodePoints {
 
 // ------------------------------------------------------------------------- ***
 
-// TODO: Non-validating iterator over the code points in a Unicode 16-bit string.
-// Assumes well-formed UTF-16. Otherwise the behavior is undefined.
-// template
-// class U16UnsafeIterator
-// TODO: only p, no start, no limit
+/**
+ * Internal base class for public U16UnsafeIterator & U16UnsafeReverseIterator.
+ * Not intended for public subclassing.
+ *
+ * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
+ * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
+ *              should be signed if U_BEHAVIOR_NEGATIVE
+ * @internal
+ */
+template<typename Unit16, typename CP32>
+class U16UnsafeIteratorBase {
+protected:
+    // @internal
+    U16UnsafeIteratorBase(const Unit16 *p) : current(p) {}
+    // Test pointers for == or != but not < or >.
+
+    // @internal
+    U16UnsafeIteratorBase(const U16UnsafeIteratorBase &other) = default;
+    // @internal
+    U16UnsafeIteratorBase &operator=(const U16UnsafeIteratorBase &other) = default;
+
+    // @internal
+    bool operator==(const U16UnsafeIteratorBase &other) const { return current == other.current; }
+    // @internal
+    bool operator!=(const U16UnsafeIteratorBase &other) const { return !operator==(other); }
+
+    // @internal
+    UnsafeCodeUnits<Unit16, CP32> readAndInc(const Unit16 *&p) const {
+        // Very similar to U16_NEXT_UNSAFE().
+        const Unit16 *p0 = p;
+        CP32 c = *p++;
+        if (!U16_IS_LEAD(c)) {
+            return {c, 1, p0};
+        } else {
+            c = U16_GET_SUPPLEMENTARY(c, *p++);
+            return {c, 2, p0};
+        }
+    }
+
+    // @internal
+    UnsafeCodeUnits<Unit16, CP32> decAndRead(const Unit16 *&p) const {
+        // Very similar to U16_PREV_UNSAFE().
+        CP32 c = *--p;
+        if (!U16_IS_TRAIL(c)) {
+            return {c, 1, p};
+        } else {
+            c = U16_GET_SUPPLEMENTARY(*--p, c);
+            return {c, 2, p};
+        }
+    }
+
+    // @internal
+    const Unit16 *current;
+};
+
+/**
+ * Non-validating bidirectional iterator over the code points in a UTF-16 string.
+ * The string must be well-formed.
+ *
+ * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
+ * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
+ *              should be signed if U_BEHAVIOR_NEGATIVE
+ * @draft ICU 77
+ */
+template<typename Unit16, typename CP32>
+class U16UnsafeIterator : private U16UnsafeIteratorBase<Unit16, CP32> {
+    // FYI: We need to qualify all accesses to super class members because of private inheritance.
+    using Super = U16UnsafeIteratorBase<Unit16, CP32>;
+public:
+    // TODO: make private, make friends
+    U16UnsafeIterator(const Unit16 *p) : Super(p) {}
+
+    U16UnsafeIterator(const U16UnsafeIterator &other) = default;
+    U16UnsafeIterator &operator=(const U16UnsafeIterator &other) = default;
+
+    bool operator==(const U16UnsafeIterator &other) const { return Super::operator==(other); }
+    bool operator!=(const U16UnsafeIterator &other) const { return !Super::operator==(other); }
+
+    UnsafeCodeUnits<Unit16, CP32> operator*() const {
+        // Call the same function in both operator*() and operator++() so that an
+        // optimizing compiler can easily eliminate redundant work when alternating between the two.
+        const Unit16 *p = Super::current;
+        return Super::readAndInc(p);
+    }
+
+    U16UnsafeIterator &operator++() {  // pre-increment
+        // Call the same function in both operator*() and operator++() so that an
+        // optimizing compiler can easily eliminate redundant work when alternating between the two.
+        Super::readAndInc(Super::current);
+        return *this;
+    }
+
+    U16UnsafeIterator operator++(int) {  // post-increment
+        // Call the same function in both operator*() and operator++() so that an
+        // optimizing compiler can easily eliminate redundant work when alternating between the two.
+        U16UnsafeIterator result(*this);
+        Super::readAndInc(Super::current);
+        return result;
+    }
+
+    U16UnsafeIterator &operator--() {  // pre-decrement
+        // Step back over one code point; the decoded value is not needed here.
+        Super::decAndRead(Super::current);
+        return *this;
+    }
+
+    U16UnsafeIterator operator--(int) {  // post-decrement
+        U16UnsafeIterator result(*this);
+        // Step back over one code point; the decoded value is not needed here.
+        Super::decAndRead(Super::current);
+        return result;
+    }
+};
+
+/**
+ * Non-validating reverse iterator over the code points in a UTF-16 string.
+ * Not bidirectional, but optimized for reverse iteration.
+ * The string must be well-formed.
+ *
+ * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
+ * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
+ *              should be signed if U_BEHAVIOR_NEGATIVE
+ * @draft ICU 77
+ */
+template<typename Unit16, typename CP32>
+class U16UnsafeReverseIterator : private U16UnsafeIteratorBase<Unit16, CP32> {
+    using Super = U16UnsafeIteratorBase<Unit16, CP32>;
+public:
+    // TODO: make private, make friends
+    U16UnsafeReverseIterator(const Unit16 *p) : Super(p) {}
+
+    U16UnsafeReverseIterator(const U16UnsafeReverseIterator &other) = default;
+    U16UnsafeReverseIterator &operator=(const U16UnsafeReverseIterator &other) = default;
+
+    bool operator==(const U16UnsafeReverseIterator &other) const { return Super::operator==(other); }
+    bool operator!=(const U16UnsafeReverseIterator &other) const { return !Super::operator==(other); }
+
+    UnsafeCodeUnits<Unit16, CP32> operator*() const {
+        // Call the same function in both operator*() and operator++() so that an
+        // optimizing compiler can easily eliminate redundant work when alternating between the two.
+        const Unit16 *p = Super::current;
+        return Super::decAndRead(p);
+    }
+
+    U16UnsafeReverseIterator &operator++() {  // pre-increment
+        // Call the same function in both operator*() and operator++() so that an
+        // optimizing compiler can easily eliminate redundant work when alternating between the two.
+        Super::decAndRead(Super::current);
+        return *this;
+    }
+
+    U16UnsafeReverseIterator operator++(int) {  // post-increment
+        // Call the same function in both operator*() and operator++() so that an
+        // optimizing compiler can easily eliminate redundant work when alternating between the two.
+        U16UnsafeReverseIterator result(*this);
+        Super::decAndRead(Super::current);
+        return result;
+    }
+};
+
+/**
+ * A C++ "range" for non-validating iteration over all of the code points of a UTF-16 string.
+ * The string must be well-formed.
+ *
+ * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
+ * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
+ *              should be signed if U_BEHAVIOR_NEGATIVE
+ * @draft ICU 77
+ */
+template<typename Unit16, typename CP32>
+class U16UnsafeStringCodePoints {
+public:
+    /**
+     * Constructs a C++ "range" object over the code points in the string.
+     * @draft ICU 77
+     */
+    U16UnsafeStringCodePoints(std::basic_string_view<Unit16> s) : s(s) {}
+
+    /** @draft ICU 77 */
+    U16UnsafeStringCodePoints(const U16UnsafeStringCodePoints &other) = default;
+    U16UnsafeStringCodePoints &operator=(const U16UnsafeStringCodePoints &other) = default;
+
+    /** @draft ICU 77 */
+    U16UnsafeIterator<Unit16, CP32> begin() const {
+        return {s.data()};
+    }
+
+    /** @draft ICU 77 */
+    U16UnsafeIterator<Unit16, CP32> end() const {
+        return {s.data() + s.length()};
+    }
+
+    /** @draft ICU 77 */
+    U16UnsafeReverseIterator<Unit16, CP32> rbegin() const {
+        return {s.data() + s.length()};
+    }
+
+    /** @draft ICU 77 */
+    U16UnsafeReverseIterator<Unit16, CP32> rend() const {
+        return {s.data()};
+    }
+
+private:
+    std::basic_string_view<Unit16> s;
+};
+
+// ------------------------------------------------------------------------- ***
+
+// TODO: UTF-8
 
 // TODO: remove experimental sample code
 #ifndef UTYPES_H
@@ -369,6 +624,24 @@ int32_t reverseLoop(std::u16string_view s) {
     }
     return sum;
 }
+
+int32_t unsafeRangeLoop(std::u16string_view s) {
+    header::U16UnsafeStringCodePoints<char16_t, UChar32> range(s);
+    int32_t sum = 0;
+    for (auto units : range) {
+        sum += units.codePoint();
+    }
+    return sum;
+}
+
+int32_t unsafeReverseLoop(std::u16string_view s) {
+    header::U16UnsafeStringCodePoints<char16_t, UChar32> range(s);
+    int32_t sum = 0;
+    for (auto iter = range.rbegin(); iter != range.rend(); ++iter) {
+        sum += (*iter).codePoint();
+    }
+    return sum;
+}
 #endif
 
 }  // namespace U_HEADER_ONLY_NAMESPACE