Skip to content

Commit

Permalink
unsafe=well-formed iterators
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed Jan 7, 2025
1 parent 5c6e1a6 commit 84dc5f4
Showing 1 changed file with 277 additions and 7 deletions.
284 changes: 277 additions & 7 deletions icu4c/source/common/unicode/utf16cppiter.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ typedef enum UIllFormedBehavior {
namespace U_HEADER_ONLY_NAMESPACE {

/**
* Result of decoding a minimal Unicode code unit sequence.
* Result of validating and decoding a minimal Unicode code unit sequence.
* Returned from validating Unicode string code point iterators.
*
* @tparam Unit Code unit type:
* UTF-8: char or char8_t or uint8_t;
Expand Down Expand Up @@ -99,6 +100,46 @@ class CodeUnits {
const Unit *p;
};

/**
 * Result of decoding a minimal Unicode code unit sequence which must be well-formed.
 * Returned from non-validating Unicode string code point iterators.
 *
 * Bundles the decoded code point with a pointer to, and the length of,
 * the code unit sequence it was decoded from.
 *
 * @tparam Unit Code unit type:
 *     UTF-8: char or char8_t or uint8_t;
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
 * @draft ICU 77
 */
template<typename Unit, typename CP32>
class UnsafeCodeUnits {
public:
    // @internal
    UnsafeCodeUnits(CP32 codePoint, uint8_t length, const Unit *data) :
            c(codePoint), len(length), p(data) {}

    UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
    UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;

    /**
     * @return the decoded code point, in the code point type selected via CP32
     *     (not a fixed UChar32, so that char32_t/uint32_t users get their own type back)
     */
    CP32 codePoint() const { return c; }

    /** @return pointer to the first code unit of the sequence */
    const Unit *data() const { return p; }

    /** @return number of code units in the sequence */
    int32_t length() const { return len; }

    /** @return a view of exactly the code units of this sequence; does not own them */
    std::basic_string_view<Unit> stringView() const {
        return std::basic_string_view<Unit>(p, len);
    }

    // TODO: std::optional<CP32> maybeCodePoint() const ? (nullopt if ill-formed)
    // NOTE(review): this class is documented as well-formed-only, so this TODO
    // may only apply to the validating counterpart -- confirm before implementing.

private:
    // Order of fields with padding and access frequency in mind.
    CP32 c;
    uint8_t len;
    const Unit *p;
};

/**
* Internal base class for public U16Iterator & U16ReverseIterator.
* Not intended for public subclassing.
Expand All @@ -118,6 +159,11 @@ class U16IteratorBase {
// TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0.
// Test pointers for == or != but not < or >.

// @internal
U16IteratorBase(const U16IteratorBase &other) = default;
// @internal
U16IteratorBase &operator=(const U16IteratorBase &other) = default;

// @internal
bool operator==(const U16IteratorBase &other) const { return current == other.current; }
// @internal
Expand Down Expand Up @@ -201,6 +247,7 @@ class U16Iterator : private U16IteratorBase<Unit16, CP32, behavior> {
Super(start, p, limit) {}

U16Iterator(const U16Iterator &other) = default;
U16Iterator &operator=(const U16Iterator &other) = default;

bool operator==(const U16Iterator &other) const { return Super::operator==(other); }
bool operator!=(const U16Iterator &other) const { return !Super::operator==(other); }
Expand Down Expand Up @@ -257,6 +304,7 @@ class U16ReverseIterator : private U16IteratorBase<Unit16, CP32, behavior> {
Super(start, p, limit) {}

U16ReverseIterator(const U16ReverseIterator &other) = default;
U16ReverseIterator &operator=(const U16ReverseIterator &other) = default;

bool operator==(const U16ReverseIterator &other) const { return Super::operator==(other); }
bool operator!=(const U16ReverseIterator &other) const { return !Super::operator==(other); }
Expand Down Expand Up @@ -285,7 +333,7 @@ class U16ReverseIterator : private U16IteratorBase<Unit16, CP32, behavior> {
};

/**
* A C++ "range" for iterating over all of the code points of a 16-bit Unicode string.
* A C++ "range" for validating iteration over all of the code points of a 16-bit Unicode string.
*
* @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
Expand All @@ -305,6 +353,9 @@ class U16StringCodePoints {
/** @draft ICU 77 */
U16StringCodePoints(const U16StringCodePoints &other) = default;

/** @draft ICU 77 */
U16StringCodePoints &operator=(const U16StringCodePoints &other) = default;

/** @draft ICU 77 */
U16Iterator<Unit16, CP32, behavior> begin() const {
return {s.data(), s.data(), s.data() + s.length()};
Expand Down Expand Up @@ -333,11 +384,212 @@ class U16StringCodePoints {

// ------------------------------------------------------------------------- ***

// TODO: Non-validating iterator over the code points in a Unicode 16-bit string.
// Assumes well-formed UTF-16. Otherwise the behavior is undefined.
// template<typename Unit16>
// class U16UnsafeIterator
// TODO: only p, no start, no limit
/**
 * Shared implementation for the public U16UnsafeIterator & U16UnsafeReverseIterator.
 * Holds only the current position; there is no start or limit pointer.
 * Not intended for public subclassing.
 *
 * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
 * @internal
 */
template<typename Unit16, typename CP32>
class U16UnsafeIteratorBase {
protected:
    // @internal
    U16UnsafeIteratorBase(const Unit16 *p) : current(p) {}
    // Pointers are only ever compared with == or !=, never with < or >.

    // @internal
    U16UnsafeIteratorBase(const U16UnsafeIteratorBase &other) = default;
    // @internal
    U16UnsafeIteratorBase &operator=(const U16UnsafeIteratorBase &other) = default;

    // @internal
    // Two iterators are equal when they point at the same code unit.
    bool operator==(const U16UnsafeIteratorBase &other) const {
        return current == other.current;
    }
    // @internal
    bool operator!=(const U16UnsafeIteratorBase &other) const {
        return current != other.current;
    }

    // @internal
    // Decodes the code point starting at p and advances p past it (1 or 2 units).
    // Modeled on U16_NEXT_UNSAFE(): a lead surrogate is combined with the
    // following unit without checking that unit.
    UnsafeCodeUnits<Unit16, CP32> readAndInc(const Unit16 *&p) const {
        const Unit16 *const first = p;
        CP32 c = *p++;
        if (U16_IS_LEAD(c)) {
            c = U16_GET_SUPPLEMENTARY(c, *p++);
            return {c, 2, first};
        }
        return {c, 1, first};
    }

    // @internal
    // Moves p back over the preceding code point (1 or 2 units) and decodes it.
    // Modeled on U16_PREV_UNSAFE(): a trail surrogate is combined with the
    // preceding unit without checking that unit.
    UnsafeCodeUnits<Unit16, CP32> decAndRead(const Unit16 *&p) const {
        CP32 c = *--p;
        if (U16_IS_TRAIL(c)) {
            c = U16_GET_SUPPLEMENTARY(*--p, c);
            return {c, 2, p};
        }
        return {c, 1, p};
    }

    // @internal
    // Position of this iterator within the string.
    const Unit16 *current;
};

/**
 * Non-validating bidirectional iterator over the code points in a UTF-16 string.
 * The string must be well-formed.
 *
 * Stores only the current position. Dereferencing decodes the code point that
 * starts at that position; incrementing/decrementing moves by one whole code point.
 *
 * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
 * @draft ICU 77
 */
template<typename Unit16, typename CP32>
class U16UnsafeIterator : private U16UnsafeIteratorBase<Unit16, CP32> {
    // FYI: We need to qualify all accesses to super class members because of private inheritance.
    using Super = U16UnsafeIteratorBase<Unit16, CP32>;
public:
    // TODO: make private, make friends
    U16UnsafeIterator(const Unit16 *p) : Super(p) {}

    U16UnsafeIterator(const U16UnsafeIterator &other) = default;
    U16UnsafeIterator &operator=(const U16UnsafeIterator &other) = default;

    bool operator==(const U16UnsafeIterator &other) const { return Super::operator==(other); }
    bool operator!=(const U16UnsafeIterator &other) const { return !Super::operator==(other); }

    /** @return the code point (and its code units) starting at the current position */
    UnsafeCodeUnits<Unit16, CP32> operator*() const {
        // Call the same function in both operator*() and operator++() so that an
        // optimizing compiler can easily eliminate redundant work when alternating between the two.
        // Decode via a local copy so that the iterator itself does not move.
        const Unit16 *p = Super::current;
        return Super::readAndInc(p);
    }

    U16UnsafeIterator &operator++() {  // pre-increment
        // Call the same function in both operator*() and operator++() so that an
        // optimizing compiler can easily eliminate redundant work when alternating between the two.
        Super::readAndInc(Super::current);
        return *this;
    }

    U16UnsafeIterator operator++(int) {  // post-increment
        // Call the same function in both operator*() and operator++() so that an
        // optimizing compiler can easily eliminate redundant work when alternating between the two.
        U16UnsafeIterator result(*this);
        Super::readAndInc(Super::current);
        return result;
    }

    U16UnsafeIterator &operator--() {  // pre-decrement
        // The base class provides no dec(); stepping back over one code point
        // is exactly what decAndRead() does to the pointer. The decoded result is discarded.
        Super::decAndRead(Super::current);
        return *this;
    }

    U16UnsafeIterator operator--(int) {  // post-decrement
        U16UnsafeIterator result(*this);
        Super::decAndRead(Super::current);
        return result;
    }
};

/**
 * Non-validating reverse iterator over the code points in a UTF-16 string.
 * Not bidirectional, but optimized for reverse iteration:
 * "incrementing" moves toward the start of the string.
 * The string must be well-formed.
 *
 * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
 * @draft ICU 77
 */
template<typename Unit16, typename CP32>
class U16UnsafeReverseIterator : private U16UnsafeIteratorBase<Unit16, CP32> {
    // Private inheritance: every base class member access must be qualified.
    using Super = U16UnsafeIteratorBase<Unit16, CP32>;
public:
    // TODO: make private, make friends
    U16UnsafeReverseIterator(const Unit16 *p) : Super(p) {}

    U16UnsafeReverseIterator(const U16UnsafeReverseIterator &other) = default;
    U16UnsafeReverseIterator &operator=(const U16UnsafeReverseIterator &other) = default;

    bool operator==(const U16UnsafeReverseIterator &other) const {
        return Super::operator==(other);
    }
    bool operator!=(const U16UnsafeReverseIterator &other) const {
        return Super::operator!=(other);
    }

    /** @return the code point (and its code units) that ends just before the current position */
    UnsafeCodeUnits<Unit16, CP32> operator*() const {
        // operator*() and operator++() deliberately share one decoding function, so that an
        // optimizing compiler can easily eliminate redundant work when alternating between the two.
        // Work on a copy of the position so that the iterator itself stays put.
        const Unit16 *pos = Super::current;
        return Super::decAndRead(pos);
    }

    U16UnsafeReverseIterator &operator++() {  // pre-increment
        // Same decoding function as operator*(); see there. The decoded value is discarded.
        Super::decAndRead(Super::current);
        return *this;
    }

    U16UnsafeReverseIterator operator++(int) {  // post-increment
        // Same decoding function as operator*(); see there. The decoded value is discarded.
        U16UnsafeReverseIterator previous = *this;
        Super::decAndRead(Super::current);
        return previous;
    }
};

/**
 * A C++ "range" for non-validating iteration over all of the code points of a UTF-16 string.
 * The string must be well-formed.
 *
 * The range stores a string view; it does not own or copy the code units.
 *
 * @tparam Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
 * @draft ICU 77
 */
template<typename Unit16, typename CP32>
class U16UnsafeStringCodePoints {
public:
    /**
     * Constructs a C++ "range" object over the code points in the string.
     * @draft ICU 77
     */
    U16UnsafeStringCodePoints(std::basic_string_view<Unit16> s) : s(s) {}

    /** @draft ICU 77 */
    U16UnsafeStringCodePoints(const U16UnsafeStringCodePoints &other) = default;

    /** @draft ICU 77 */
    U16UnsafeStringCodePoints &operator=(const U16UnsafeStringCodePoints &other) = default;

    /**
     * @return forward iterator positioned at the first code unit of the string
     * @draft ICU 77
     */
    U16UnsafeIterator<Unit16, CP32> begin() const {
        return U16UnsafeIterator<Unit16, CP32>(s.data());
    }

    /**
     * @return forward iterator positioned just past the last code unit of the string
     * @draft ICU 77
     */
    U16UnsafeIterator<Unit16, CP32> end() const {
        const Unit16 *const limit = s.data() + s.length();
        return U16UnsafeIterator<Unit16, CP32>(limit);
    }

    /**
     * @return reverse iterator positioned just past the last code unit of the string
     * @draft ICU 77
     */
    U16UnsafeReverseIterator<Unit16, CP32> rbegin() const {
        const Unit16 *const limit = s.data() + s.length();
        return U16UnsafeReverseIterator<Unit16, CP32>(limit);
    }

    /**
     * @return reverse iterator positioned at the first code unit of the string
     * @draft ICU 77
     */
    U16UnsafeReverseIterator<Unit16, CP32> rend() const {
        return U16UnsafeReverseIterator<Unit16, CP32>(s.data());
    }

private:
    // The string being iterated over (non-owning view).
    std::basic_string_view<Unit16> s;
};

// ------------------------------------------------------------------------- ***

// TODO: UTF-8

// TODO: remove experimental sample code
#ifndef UTYPES_H
Expand Down Expand Up @@ -369,6 +621,24 @@ int32_t reverseLoop(std::u16string_view s) {
}
return sum;
}

int32_t unsafeRangeLoop(std::u16string_view s) {
header::U16UnsafeStringCodePoints<char16_t, UChar32> range(s);
int32_t sum = 0;
for (auto units : range) {
sum += units.codePoint();
}
return sum;
}

int32_t unsafeReverseLoop(std::u16string_view s) {
header::U16UnsafeStringCodePoints<char16_t, UChar32> range(s);
int32_t sum = 0;
for (auto iter = range.rbegin(); iter != range.rend(); ++iter) {
sum += (*iter).codePoint();
}
return sum;
}
#endif

} // namespace U_HEADER_ONLY_NAMESPACE
Expand Down

0 comments on commit 84dc5f4

Please sign in to comment.