Skip to content

Commit

Permalink
AK: Explicitly check for null data in Utf16View
Browse files Browse the repository at this point in the history
The underlying CPU-specific instructions for operating on UTF-16 strings
behave differently for null inputs. Add an explicit check for this state
for consistency.
  • Loading branch information
trflynn89 authored and awesomekling committed Jul 21, 2024
1 parent 144452d commit 74d644a
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 0 deletions.
2 changes: 2 additions & 0 deletions AK/String.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ ErrorOr<String> String::from_utf16(Utf16View const& utf16)
{
if (!utf16.validate())
return Error::from_string_literal("String::from_utf16: Input was not valid UTF-16");
if (utf16.is_empty())
return String {};

String result;

Expand Down
19 changes: 19 additions & 0 deletions AK/Utf16View.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view, Endianness endiannes
// All callers want to allow lonely surrogates, which simdutf does not permit.
if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]]
return to_utf16_slow(utf8_view, endianness);
if (utf8_view.is_empty())
return Utf16Data {};

auto const* data = reinterpret_cast<char const*>(utf8_view.bytes());
auto length = utf8_view.byte_length();
Expand All @@ -85,6 +87,9 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view, Endianness endiannes

ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view, Endianness endianness)
{
if (utf32_view.is_empty())
return Utf16Data {};

auto const* data = reinterpret_cast<char32_t const*>(utf32_view.code_points());
auto length = utf32_view.length();

Expand Down Expand Up @@ -288,6 +293,10 @@ bool Utf16View::starts_with(Utf16View const& needle) const

bool Utf16View::validate() const
{
// FIXME: The CPU-specific implementations behave differently on null inputs. We treat null views as an empty string.
if (is_empty())
return true;

switch (m_endianness) {
case Endianness::Host:
return simdutf::validate_utf16(char_data(), length_in_code_units());
Expand All @@ -301,6 +310,12 @@ bool Utf16View::validate() const

bool Utf16View::validate(size_t& valid_code_units) const
{
// FIXME: The CPU-specific implementations behave differently on null inputs. We treat null views as an empty string.
if (is_empty()) {
valid_code_units = 0;
return true;
}

auto result = [&]() {
switch (m_endianness) {
case Endianness::Host:
Expand All @@ -319,6 +334,10 @@ bool Utf16View::validate(size_t& valid_code_units) const

size_t Utf16View::calculate_length_in_code_points() const
{
// FIXME: The CPU-specific implementations behave differently on null inputs. We treat null views as an empty string.
if (is_empty())
return 0;

// FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement
// for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can
// remove this branch.
Expand Down
13 changes: 13 additions & 0 deletions Tests/AK/TestUtf16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,19 @@ TEST_CASE(decode_utf16)
EXPECT_EQ(i, expected.size());
}

// A null Utf16View must behave exactly like an empty string: every query below has to succeed
// and report "nothing here" rather than forwarding null data to the UTF-16 helper routines.
TEST_CASE(null_view)
{
    Utf16View view;

    EXPECT(view.validate());

    // The overload that reports the number of valid code units has its own early return for
    // empty views, so exercise it separately. The out-parameter is seeded with a non-zero
    // value to prove that the call really resets it to zero.
    size_t valid_code_units = 1;
    EXPECT(view.validate(valid_code_units));
    EXPECT_EQ(valid_code_units, 0zu);

    EXPECT_EQ(view.length_in_code_units(), 0zu);
    EXPECT_EQ(view.length_in_code_points(), 0zu);

    // Conversion to UTF-8 must yield an empty string for both invalid-code-unit policies.
    EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), ""sv);
    EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), ""sv);

    // Iteration must terminate immediately; reaching the loop body at all is a failure.
    for ([[maybe_unused]] auto it : view)
        FAIL("Iterating a null UTF-16 string should not produce any values");
}

TEST_CASE(utf16_literal)
{
{
Expand Down

0 comments on commit 74d644a

Please sign in to comment.