Skip to content

Commit

Permalink
U16Iterator op*() returns U16OneSeq
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed Dec 23, 2024
1 parent 74e9b6f commit 6568b04
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 59 deletions.
3 changes: 3 additions & 0 deletions icu4c/source/common/common.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -1258,6 +1258,9 @@
<CustomBuild Include="unicode\utf16.h">
<Filter>strings</Filter>
</CustomBuild>
<CustomBuild Include="unicode\utf16cppiter.h">
<Filter>strings</Filter>
</CustomBuild>
<CustomBuild Include="unicode\utf32.h">
<Filter>strings</Filter>
</CustomBuild>
Expand Down
136 changes: 83 additions & 53 deletions icu4c/source/common/unicode/utf16cppiter.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,92 +39,122 @@ enum U16IllFormedBehavior {
U16_BEHAVIOR_SURROGATE
};

// Validating iterator over the code points in a Unicode 16-bit string.
// TODO: all @draft ICU 76
/**
* A code unit sequence for one code point returned by U16Iterator.
*
* @tparam Unit16 char16_t or uint16_t or (on Windows) wchar_t
* @draft ICU 76
*/
template<typename Unit16>
class U16OneSeq {
public:
    U16OneSeq(const U16OneSeq &other) = default;

    /**
     * @return pointer to the first code unit of this sequence
     *
     * const-qualified like the other accessors, so that it can be called through
     * the const reference that U16Iterator::operator*() returns.
     */
    const Unit16 *data() const { return p; }

    /** @return the number of code units in this sequence; 0 if nothing was read at the limit */
    int32_t length() const { return len; }

    /** @return a view of the length() code units starting at data() */
    std::basic_string_view<Unit16> stringView() const {
        return std::basic_string_view<Unit16>(p, len);
    }

    /** @return true if a well-formed sequence was read; false for an unpaired surrogate */
    bool isWellFormed() const { return ok; }

    /** @return the code point of this sequence; U+FFFD if the sequence is ill-formed */
    UChar32 codePoint() const { return c; }

    // TODO: std::optional<UChar32> maybeCodePoint() const ? (nullopt if !ok)

private:
    // TODO: Why can't we just use Unit16 here?
    // error: declaration of 'Unit16' shadows template parameter
    template<typename SomeOtherUnit16, U16IllFormedBehavior behavior>
    friend class U16Iterator;

    // Only the iterator creates sequences; c/len/ok keep their defaults
    // until readOneForward() is called.
    explicit U16OneSeq(const Unit16 *p) : p(p) {}

    // Moves past the code units of the sequence that was read last.
    void fwd1() { p += len; }

    // Reads the sequence that starts at p and sets c, len and ok.
    // At the limit only len is reset to 0; c and ok keep their previous values.
    void readOneForward(const Unit16 *limit) {
        if (p == limit) {
            len = 0;
            return;
        }
        // see U16_NEXT_OR_FFFD()
        c = *p;
        len = 1;
        ok = true;
        if (U16_IS_SURROGATE(c)) {
            uint16_t c2;
            // c2 is assigned inside the condition, and only after the limit check
            // has established that p[1] may be read.
            if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) {
                c = U16_GET_SUPPLEMENTARY(c, c2);
                len = 2;
            } else {
                // Unpaired surrogate: one code unit, reported as U+FFFD.
                // TODO: U16IllFormedBehavior
                c = 0xfffd;
                ok = false;
            }
        }
    }

    const Unit16 *p;   // start of the current sequence
    UChar32 c = 0;     // code point of the current sequence
    int8_t len = 0;    // number of code units in the current sequence: 0, 1 or 2
    bool ok = false;   // whether the current sequence is well-formed
};

/**
* Validating iterator over the code points in a Unicode 16-bit string.
*
* @tparam Unit16 char16_t or uint16_t or (on Windows) wchar_t
* @tparam behavior how to handle ill-formed sequences (a U16IllFormedBehavior value); TODO: not implemented yet
* @draft ICU 76
*/
template<typename Unit16, U16IllFormedBehavior behavior>
class U16Iterator {
// NOTE(review): this class body appears to interleave the pre-change and post-change
// lines of a rendered diff (two constructor initializer lists, two operator==,
// two operator*, two sets of data members), so it does not compile as shown.
// The lines that use the members p/c/len/ok and the parameterless readOneForward()
// look like the superseded versions of the seq-based lines next to them —
// confirm against the real header before editing.
public:
// TODO: make private, make friends
// Stores start and limit, and immediately reads the sequence at p
// so that operator*() has something to return.
U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) :
start(start), p(p), limit(limit) {
if (p != limit) {
readOneForward();
}
start(start), limit(limit), seq(p) {
seq.readOneForward(limit);
}
// TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0.
// Test pointers for == or != but not < or >.

U16Iterator(const U16Iterator &other) = default;
U16Iterator(U16Iterator &&other) noexcept = default;

// Iterators are equal when they are at the same code unit position;
// start and limit are not compared.
bool operator==(const U16Iterator &other) const { return p == other.p; }
bool operator==(const U16Iterator &other) const { return seq.p == other.seq.p; }
bool operator!=(const U16Iterator &other) const { return !operator==(other); }

UChar32 operator*() const {
return c;
}

// TODO: good function names?
// It would be nice to avoid a prefix like "current", "one", "cp",
// but just length() on the iterator could be confusing.
int32_t currentLength() const { return len; }

std::basic_string_view<Unit16> currentView() const {
return std::basic_string_view<Unit16>(p, len);
// Returns a reference to the sequence object stored inside this iterator.
// NOTE(review): the reference presumably must not outlive the iterator object
// it was obtained from (e.g. the temporary returned by post-increment) — confirm at call sites.
const U16OneSeq<Unit16> &operator*() const {
return seq;
}

bool currentIsWellFormed() const { return ok; }

U16Iterator &operator++() { // pre-increment
// TODO: think about switching directions etc.
// Assume that readOneForward() was called and set `len`.
// Assume that readOneForward() was called and set seq.len.
// Skip the current code point, then read the next one.
p += len;
if (p != limit) {
readOneForward();
}
seq.fwd1();
seq.readOneForward(limit);
return *this;
}

U16Iterator operator++(int) { // post-increment
// Copy the iterator first; the copy, still at the old position, is what gets returned.
U16Iterator result(*this);
// TODO: think about switching directions etc.
// Assume that readOneForward() was called and set `len`.
// Assume that readOneForward() was called and set seq.len.
// Skip the current code point, then read the next one.
p += len;
if (p != limit) {
readOneForward();
}
seq.fwd1();
seq.readOneForward(limit);
return result;
}

private:
void readOneForward() {
// see U16_NEXT_OR_FFFD()
c = *p;
len = 1;
ok = true;
if (U16_IS_SURROGATE(c)) {
uint16_t c2;
if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) {
c = U16_GET_SUPPLEMENTARY(c, c2);
len = 2;
} else {
// TODO: U16IllFormedBehavior
c = 0xfffd;
ok = false;
}
}
}

// In a validating iterator, we need start & limit so that when we read a code point
// (forward or backward) we can test if there are enough code units.
const Unit16 *start;
const Unit16 *p;
const Unit16 *limit;
UChar32 c = 0;
int8_t len = 0;
bool ok = false;
// The current position together with the code point, length and well-formedness
// read there (see U16OneSeq).
U16OneSeq<Unit16> seq;
};

// ------------------------------------------------------------------------- ***
Expand Down
1 change: 1 addition & 0 deletions icu4c/source/test/intltest/intltest.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@
<ClCompile Include="sfwdchit.cpp" />
<ClCompile Include="strcase.cpp" />
<ClCompile Include="ustrtest.cpp" />
<ClCompile Include="utfcppitertest.cpp" />
<ClCompile Include="utxttest.cpp" />
<ClCompile Include="cpdtrtst.cpp" />
<ClCompile Include="ittrans.cpp" />
Expand Down
3 changes: 3 additions & 0 deletions icu4c/source/test/intltest/intltest.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,9 @@
<ClCompile Include="ustrtest.cpp">
<Filter>strings</Filter>
</ClCompile>
<ClCompile Include="utfcppitertest.cpp">
<Filter>strings</Filter>
</ClCompile>
<ClCompile Include="utxttest.cpp">
<Filter>strings</Filter>
</ClCompile>
Expand Down
16 changes: 10 additions & 6 deletions icu4c/source/test/intltest/utfcppitertest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
// https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv
using namespace std::string_view_literals;

using U_HEADER_ONLY_NAMESPACE::U16Iterator;
using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_NEGATIVE;
using U_HEADER_ONLY_NAMESPACE::U16Iterator;
using U_HEADER_ONLY_NAMESPACE::U16OneSeq;

class U16IteratorTest : public IntlTest {
public:
Expand Down Expand Up @@ -44,14 +45,17 @@ void U16IteratorTest::testExperiment() {
std::u16string_view good(u"abçカ🚴"sv);
const char16_t *goodLimit = good.data() + good.length();
U16Iterator<char16_t, U16_BEHAVIOR_NEGATIVE> goodIter(good.data(), good.data(), goodLimit);
assertEquals("goodIter[0] *", u'a', *goodIter);
assertEquals("goodIter[0] * codePoint()", u'a', (*goodIter).codePoint());
++goodIter; // pre-increment
assertEquals("goodIter[1] *", u'b', *goodIter);
assertEquals("goodIter[1] * codePoint()", u'b', (*goodIter).codePoint());
++goodIter;
assertEquals("goodIter[2] *", u'ç', *goodIter++); // post-increment
assertEquals("goodIter[3] *", u'', *goodIter);
assertEquals("goodIter[2] * codePoint()", u'ç', (*goodIter++).codePoint()); // post-increment
assertEquals("goodIter[3] * codePoint()", u'', (*goodIter).codePoint());
++goodIter;
assertEquals("goodIter[4] *", U'🚴', *goodIter++);
const U16OneSeq<char16_t> &seq = *goodIter++;
assertEquals("goodIter[4] * codePoint()", U'🚴', seq.codePoint());
assertEquals("goodIter[4] * length()", 2, seq.length());
assertTrue("goodIter[4] * stringView()", seq.stringView() == u"🚴"sv);
U16Iterator<char16_t, U16_BEHAVIOR_NEGATIVE> goodEndIter(good.data(), goodLimit, goodLimit);
assertTrue("goodIter == goodEndIter", goodIter == goodEndIter);

Expand Down

0 comments on commit 6568b04

Please sign in to comment.