diff --git a/tests/bootstrap.php b/tests/bootstrap.php
index 7df12ca1..2b0792e2 100644
--- a/tests/bootstrap.php
+++ b/tests/bootstrap.php
@@ -1,7 +1,6 @@
next_token()->get_type();
+ $is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
+ if ( $is_valid ) {
+ $this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type );
+ } elseif ( strlen( $value ) === 0 ) {
+ $this->assertSame( WP_MySQL_Lexer::EOF, $type );
+ } else {
+ $this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type );
+ }
+ }
+ }
+
+ /**
+  * Test all valid and invalid 2-byte UTF-8 sequences in an identifier.
+  * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set.
+  *
+  * Start both bytes from 128 and go up to 255 to include all invalid 2-byte
+  * UTF-8 sequences as well, and ensure that they won't match as identifiers.
+  *
+  * Every assertion carries the offending bytes in its failure message, as the
+  * loop covers 16,384 inputs and a bare failure would not identify which one.
+  */
+ public function test_identifier_utf8_two_byte_sequences(): void {
+ 	for ( $byte_1 = 128; $byte_1 <= 255; $byte_1 += 1 ) {
+ 		for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) {
+ 			$value = chr( $byte_1 ) . chr( $byte_2 );
+
+ 			// preg_match() yields 1 on a match, 0 on no match, and false when
+ 			// the subject is malformed UTF-8; only 1 means "valid identifier".
+ 			$is_valid = 1 === preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
+ 			$expected = $is_valid
+ 				? WP_MySQL_Lexer::IDENTIFIER
+ 				: WP_MySQL_Lexer::INVALID_INPUT;
+
+ 			$lexer = new WP_MySQL_Lexer( $value );
+ 			$type  = $lexer->next_token()->get_type();
+
+ 			$this->assertSame(
+ 				$expected,
+ 				$type,
+ 				sprintf(
+ 					'Unexpected token type for byte sequence 0x%s.',
+ 					strtoupper( bin2hex( $value ) )
+ 				)
+ 			);
+ 		}
+ 	}
+ }
+
+ /**
+  * Test all valid and invalid 3-byte UTF-8 sequences in an identifier.
+  * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set.
+  *
+  * Start the first byte from 0xE0 to mark the beginning of a 3-byte sequence.
+  * Start bytes 2 and 3 from 128 and go up to 255 to include all invalid 3-byte
+  * UTF-8 sequences as well, and ensure that they won't match as identifiers.
+  *
+  * Every assertion carries the offending bytes in its failure message, as the
+  * loop covers 524,288 inputs and a bare failure would not identify which one.
+  */
+ public function test_identifier_utf8_three_byte_sequences(): void {
+ 	for ( $byte_1 = 0xE0; $byte_1 <= 0xFF; $byte_1 += 1 ) {
+ 		for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) {
+ 			for ( $byte_3 = 128; $byte_3 <= 255; $byte_3 += 1 ) {
+ 				$value = chr( $byte_1 ) . chr( $byte_2 ) . chr( $byte_3 );
+
+ 				// preg_match() yields 1 on a match, 0 on no match, and false
+ 				// when the subject is malformed UTF-8; only 1 means "valid".
+ 				$is_valid = 1 === preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
+ 				$expected = $is_valid
+ 					? WP_MySQL_Lexer::IDENTIFIER
+ 					: WP_MySQL_Lexer::INVALID_INPUT;
+
+ 				$lexer = new WP_MySQL_Lexer( $value );
+ 				$type  = $lexer->next_token()->get_type();
+
+ 				$this->assertSame(
+ 					$expected,
+ 					$type,
+ 					sprintf(
+ 						'Unexpected token type for byte sequence 0x%s.',
+ 						strtoupper( bin2hex( $value ) )
+ 					)
+ 				);
+ 			}
+ 		}
+ 	}
+ }
+
/**
* Numbers vs. identifiers:
*
diff --git a/tests/tools/run-lexer-benchmark.php b/tests/tools/run-lexer-benchmark.php
index 2564f330..e970e448 100644
--- a/tests/tools/run-lexer-benchmark.php
+++ b/tests/tools/run-lexer-benchmark.php
@@ -12,7 +12,6 @@ function ( $severity, $message, $file, $line ) {
}
);
-require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
diff --git a/tests/tools/run-parser-benchmark.php b/tests/tools/run-parser-benchmark.php
index afba53e2..1ab4859f 100644
--- a/tests/tools/run-parser-benchmark.php
+++ b/tests/tools/run-parser-benchmark.php
@@ -13,7 +13,6 @@ function ( $severity, $message, $file, $line ) {
}
);
-require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser-grammar.php';
diff --git a/tests/tools/run-parser-test.php b/tests/tools/run-parser-test.php
index 78fd32ac..64bd4284 100644
--- a/tests/tools/run-parser-test.php
+++ b/tests/tools/run-parser-test.php
@@ -12,7 +12,6 @@ function ( $severity, $message, $file, $line ) {
}
);
-require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser.php';
diff --git a/wp-includes/mysql/class-wp-mysql-lexer.php b/wp-includes/mysql/class-wp-mysql-lexer.php
index e49a861f..6975ce15 100644
--- a/wp-includes/mysql/class-wp-mysql-lexer.php
+++ b/wp-includes/mysql/class-wp-mysql-lexer.php
@@ -2483,7 +2483,7 @@ private function get_current_token_bytes(): string {
* https://dev.mysql.com/doc/refman/8.4/en/identifiers.html
*
* Rules:
- * 1. Allowed characters are ASCII a-z, A-Z, 0-9, _, $, and Unicode \x{0080}-\x{ffff}.
+ * 1. Allowed characters are ASCII a-z, A-Z, 0-9, _, $, and Unicode U+0080-U+FFFF.
* 2. Unquoted identifiers may begin with a digit but may not consist solely of digits.
*/
private function parse_identifier(): int {
@@ -2497,28 +2497,48 @@ private function parse_identifier(): int {
$this->bytes_already_read + $byte_length
);
- // Check if the following byte can be part of a multibyte character.
- // If not, bail out early to avoid unnecessary UTF-8 decoding.
- $byte = $this->sql[ $this->bytes_already_read + $byte_length ] ?? null;
- if ( null === $byte || ord( $byte ) < 128 ) {
+ // Check if the following byte can be part of a multibyte character
+ // in the range of U+0080 to U+FFFF before looking at further bytes.
+ // If it can't, bail out early to avoid unnecessary UTF-8 decoding.
+ // Identifiers are usually ASCII-only, so we can optimize for that.
+ $byte_1 = ord(
+ $this->sql[ $this->bytes_already_read + $byte_length ] ?? ''
+ );
+ if ( $byte_1 < 0xC2 || $byte_1 > 0xEF ) {
break;
}
- // Check the \x{0080}-\x{ffff} Unicode character range.
- $codepoint = utf8_codepoint_at(
- $this->sql,
- $this->bytes_already_read + $byte_length,
- $bytes_parsed
+ // Look for a valid 2-byte UTF-8 symbol. Covers range U+0080 - U+07FF.
+ $byte_2 = ord(
+ $this->sql[ $this->bytes_already_read + $byte_length + 1 ] ?? ''
);
+ if (
+ $byte_1 <= 0xDF
+ && $byte_2 >= 0x80 && $byte_2 <= 0xBF
+ ) {
+ $byte_length += 2;
+ continue;
+ }
+ // Look for a valid 3-byte UTF-8 symbol in range U+0800 - U+FFFF.
+ $byte_3 = ord(
+ $this->sql[ $this->bytes_already_read + $byte_length + 2 ] ?? ''
+ );
if (
- null === $codepoint
- || ! ( 0x80 <= $codepoint && 0xffff >= $codepoint )
+ $byte_1 <= 0xEF
+ && $byte_2 >= 0x80 && $byte_2 <= 0xBF
+ && $byte_3 >= 0x80 && $byte_3 <= 0xBF
+ // Exclude surrogate range U+D800 to U+DFFF:
+ && ! ( 0xED === $byte_1 && $byte_2 >= 0xA0 )
+ // Exclude overlong encodings:
+ && ! ( 0xE0 === $byte_1 && $byte_2 < 0xA0 )
) {
- break;
+ $byte_length += 3;
+ continue;
}
- $byte_length += $bytes_parsed;
+ // Not a valid identifier character.
+ break;
}
// An identifier cannot consist solely of digits.
diff --git a/wp-includes/utf8-decoder.php b/wp-includes/utf8-decoder.php
deleted file mode 100644
index 55c9ccc6..00000000
--- a/wp-includes/utf8-decoder.php
+++ /dev/null
@@ -1,293 +0,0 @@
-
- * - `UTF8_DECODER_ACCEPT`: Decoder is ready for a new code point.
- * - `UTF8_DECODER_REJECT`: An error has occurred.
- * Any other positive value: Decoder is waiting for additional bytes.
- * @param int|null $code_point Optional. If provided, will accumulate the decoded code point as
- * each byte is processed. If not provided or unable to decode, will
- * not be set, or will be set to invalid and unusable data.
- * @return int Next decoder state after processing the current byte.
- */
-function utf8_decoder_apply_byte( string $byte, int $state, int &$code_point = 0 ): int {
- /**
- * State classification and transition table for UTF-8 validation.
- *
- * > The first part of the table maps bytes to character classes that
- * > to reduce the size of the transition table and create bitmasks.
- * >
- * > The second part is a transition table that maps a combination
- * > of a state of the automaton and a character class to a state.
- *
- * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
- */
- static $state_table = (
- "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
- "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
- "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
- "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
- "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09" .
- "\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" .
- "\x08\x08\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02" .
- "\x10\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x03\x03" .
- "\x11\x06\x06\x06\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" .
- "\x00\x01\x02\x03\x05\x08\x07\x01\x01\x01\x04\x06\x01\x01\x01\x01" .
- "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x01\x01\x01\x01\x01\x00\x01\x00\x01\x01\x01\x01\x01\x01" .
- "\x01\x02\x01\x01\x01\x01\x01\x02\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01" .
- "\x01\x02\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01" .
- "\x01\x03\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01\x01\x03\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
- );
-
- $byte = ord( $byte );
- $type = ord( $state_table[ $byte ] );
- $code_point = ( UTF8_DECODER_ACCEPT === $state )
- ? ( ( 0xFF >> $type ) & $byte )
- : ( ( $byte & 0x3F ) | ( $code_point << 6 ) );
-
- return ord( $state_table[ 256 + ( $state * 16 ) + $type ] );
-}
-
-/**
- * Extract a slice of a text by code point, where invalid byte seuqences count
- * as a single code point, U+FFFD (the Unicode replacement character `�`).
- *
- * This function does not permit passing negative indices and will return
- * the original string if such are provide.
- *
- * @param string $text Input text from which to extract.
- * @param int $from Start extracting after this many code-points.
- * @param int $length Extract this many code points.
- *
- * @return string Extracted slice of input string.
- */
-function utf8_substr( string $text, int $from = 0, ?int $length = null ): string {
- if ( $from < 0 || ( isset( $length ) && $length < 0 ) ) {
- return $text;
- }
-
- $position_in_input = 0;
- $code_point_at = 0;
- $end_byte = strlen( $text );
- $buffer = '';
- $seen_code_points = 0;
- $sliced_code_points = 0;
- $decoder_state = UTF8_DECODER_ACCEPT;
-
- // Get to the start of the string.
- while ( $position_in_input < $end_byte && $seen_code_points < $length ) {
- $decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );
-
- if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
- ++$position_in_input;
-
- if ( $seen_code_points >= $from ) {
- ++$sliced_code_points;
- $buffer .= substr( $text, $code_point_at, $position_in_input - $code_point_at );
- }
-
- ++$seen_code_points;
- $code_point_at = $position_in_input;
- } elseif ( UTF8_DECODER_REJECT === $decoder_state ) {
- $buffer .= "\u{FFFD}";
-
- // Skip to the start of the next code point.
- while ( UTF8_DECODER_REJECT === $decoder_state && $position_in_input < $end_byte ) {
- $decoder_state = utf8_decoder_apply_byte( $text[ ++$position_in_input ], UTF8_DECODER_ACCEPT );
- }
-
- ++$seen_code_points;
- $code_point_at = $position_in_input;
- $decoder_state = UTF8_DECODER_ACCEPT;
- } else {
- ++$position_in_input;
- }
- }
-
- return $buffer;
-}
-
-/**
- * Extract a unicode codepoint from a specific offset in text.
- * Invalid byte sequences count as a single code point, U+FFFD
- * (the Unicode replacement character ``).
- *
- * This function does not permit passing negative indices and will return
- * null if such are provided.
- *
- * @param string $text Input text from which to extract.
- * @param int $byte_offset Start at this byte offset in the input text.
- * @param int $matched_bytes How many bytes were matched to produce the codepoint.
- *
- * @return int Unicode codepoint.
- */
-function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes = 0 ) {
- if ( $byte_offset < 0 ) {
- return null;
- }
-
- $position_in_input = $byte_offset;
- $code_point_at = $byte_offset;
- $end_byte = strlen( $text );
- $codepoint = null;
- $decoder_state = UTF8_DECODER_ACCEPT;
-
- // Get to the start of the string.
- while ( $position_in_input < $end_byte ) {
- $decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );
-
- if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
- ++$position_in_input;
- $codepoint = utf8_ord( substr( $text, $code_point_at, $position_in_input - $code_point_at ) );
- break;
- } elseif ( UTF8_DECODER_REJECT === $decoder_state ) {
- $codepoint = utf8_ord( "\u{FFFD}" );
- break;
- } else {
- ++$position_in_input;
- }
- }
-
- $matched_bytes = $position_in_input - $byte_offset;
- return $codepoint;
-}
-
-/**
- * Convert a UTF-8 byte sequence to its Unicode codepoint.
- *
- * @param string $character UTF-8 encoded byte sequence representing a single Unicode character.
- * @return int Unicode codepoint.
- */
-function utf8_ord( string $character ): int {
- // Convert the byte sequence to its binary representation
- $bytes = unpack( 'C*', $character );
-
- // Initialize the codepoint
- $codepoint = 0;
-
- // Calculate the codepoint based on the number of bytes
- if ( count( $bytes ) === 1 ) {
- $codepoint = $bytes[1];
- } elseif ( count( $bytes ) === 2 ) {
- $codepoint = ( ( $bytes[1] & 0x1F ) << 6 ) | ( $bytes[2] & 0x3F );
- } elseif ( count( $bytes ) === 3 ) {
- $codepoint = ( ( $bytes[1] & 0x0F ) << 12 ) | ( ( $bytes[2] & 0x3F ) << 6 ) | ( $bytes[3] & 0x3F );
- } elseif ( count( $bytes ) === 4 ) {
- $codepoint = ( ( $bytes[1] & 0x07 ) << 18 ) | ( ( $bytes[2] & 0x3F ) << 12 ) | ( ( $bytes[3] & 0x3F ) << 6 ) | ( $bytes[4] & 0x3F );
- }
-
- return $codepoint;
-}