diff --git a/tests/bootstrap.php b/tests/bootstrap.php index 7df12ca1..2b0792e2 100644 --- a/tests/bootstrap.php +++ b/tests/bootstrap.php @@ -1,7 +1,6 @@ next_token()->get_type(); + $is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value ); + if ( $is_valid ) { + $this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type ); + } elseif ( strlen( $value ) === 0 ) { + $this->assertSame( WP_MySQL_Lexer::EOF, $type ); + } else { + $this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type ); + } + } + } + + /** + * Test all valid and invalid 2-byte UTF-8 sequences in an identifier. + * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set. + * + * Start both bytes from 128 and go up to 255 to include all invalid 2-byte + * UTF-8 sequences as well, and ensure that they won't match as identifiers. + */ + public function test_identifier_utf8_two_byte_sequences(): void { + for ( $byte_1 = 128; $byte_1 <= 255; $byte_1 += 1 ) { + for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) { + $value = chr( $byte_1 ) . chr( $byte_2 ); + $is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value ); + $lexer = new WP_MySQL_Lexer( $value ); + $type = $lexer->next_token()->get_type(); + if ( $is_valid ) { + $this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type ); + } else { + $this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type ); + } + } + } + } + + /** + * Test all valid and invalid 3-byte UTF-8 sequences in an identifier. + * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set. + * + * Start the first byte from 0xE0 to mark the beginning of a 3-byte sequence. + * Start bytes 2 and 3 from 128 and go up to 255 to include all invalid 3-byte + * UTF-8 sequences as well, and ensure that they won't match as identifiers. + */ + public function test_identifier_utf8_three_byte_sequences(): void { + for ( $byte_1 = 0xE0; $byte_1 <= 0xFF; $byte_1 += 1 ) { + for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) { + for ( $byte_3 = 128; $byte_3 <= 255; $byte_3 += 1 ) { + $value = chr( $byte_1 ) . chr( $byte_2 ) . chr( $byte_3 ); + $is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value ); + $lexer = new WP_MySQL_Lexer( $value ); + $type = $lexer->next_token()->get_type(); + if ( $is_valid ) { + $this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type ); + } else { + $this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type ); + } + } + } + } + } + /** * Numbers vs. identifiers: * diff --git a/tests/tools/run-lexer-benchmark.php b/tests/tools/run-lexer-benchmark.php index 2564f330..e970e448 100644 --- a/tests/tools/run-lexer-benchmark.php +++ b/tests/tools/run-lexer-benchmark.php @@ -12,7 +12,6 @@ function ( $severity, $message, $file, $line ) { } ); -require_once __DIR__ . '/../../wp-includes/utf8-decoder.php'; require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php'; require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php'; diff --git a/tests/tools/run-parser-benchmark.php b/tests/tools/run-parser-benchmark.php index afba53e2..1ab4859f 100644 --- a/tests/tools/run-parser-benchmark.php +++ b/tests/tools/run-parser-benchmark.php @@ -13,7 +13,6 @@ function ( $severity, $message, $file, $line ) { } ); -require_once __DIR__ . '/../../wp-includes/utf8-decoder.php'; require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php'; require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php'; require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser-grammar.php'; diff --git a/tests/tools/run-parser-test.php b/tests/tools/run-parser-test.php index 78fd32ac..64bd4284 100644 --- a/tests/tools/run-parser-test.php +++ b/tests/tools/run-parser-test.php @@ -12,7 +12,6 @@ function ( $severity, $message, $file, $line ) { } ); -require_once __DIR__ . '/../../wp-includes/utf8-decoder.php'; require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php'; require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php'; require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser.php'; diff --git a/wp-includes/mysql/class-wp-mysql-lexer.php b/wp-includes/mysql/class-wp-mysql-lexer.php index e49a861f..6975ce15 100644 --- a/wp-includes/mysql/class-wp-mysql-lexer.php +++ b/wp-includes/mysql/class-wp-mysql-lexer.php @@ -2483,7 +2483,7 @@ private function get_current_token_bytes(): string { * https://dev.mysql.com/doc/refman/8.4/en/identifiers.html * * Rules: - * 1. Allowed characters are ASCII a-z, A-Z, 0-9, _, $, and Unicode \x{0080}-\x{ffff}. + * 1. Allowed characters are ASCII a-z, A-Z, 0-9, _, $, and Unicode U+0080-U+FFFF. * 2. Unquoted identifiers may begin with a digit but may not consist solely of digits. */ private function parse_identifier(): int { @@ -2497,28 +2497,48 @@ private function parse_identifier(): int { $this->bytes_already_read + $byte_length ); - // Check if the following byte can be part of a multibyte character. - // If not, bail out early to avoid unnecessary UTF-8 decoding. - $byte = $this->sql[ $this->bytes_already_read + $byte_length ] ?? null; - if ( null === $byte || ord( $byte ) < 128 ) { + // Check if the following byte can be part of a multibyte character + // in the range of U+0080 to U+FFFF before looking at further bytes. + // If it can't, bail out early to avoid unnecessary UTF-8 decoding. + // Identifiers are usually ASCII-only, so we can optimize for that. + $byte_1 = ord( + $this->sql[ $this->bytes_already_read + $byte_length ] ?? '' + ); + if ( $byte_1 < 0xC2 || $byte_1 > 0xEF ) { break; } - // Check the \x{0080}-\x{ffff} Unicode character range. - $codepoint = utf8_codepoint_at( - $this->sql, - $this->bytes_already_read + $byte_length, - $bytes_parsed + // Look for a valid 2-byte UTF-8 symbol. Covers range U+0080 - U+07FF. + $byte_2 = ord( + $this->sql[ $this->bytes_already_read + $byte_length + 1 ] ?? '' ); + if ( + $byte_1 <= 0xDF + && $byte_2 >= 0x80 && $byte_2 <= 0xBF + ) { + $byte_length += 2; + continue; + } + // Look for a valid 3-byte UTF-8 symbol in range U+0800 - U+FFFF. + $byte_3 = ord( + $this->sql[ $this->bytes_already_read + $byte_length + 2 ] ?? '' + ); if ( - null === $codepoint - || ! ( 0x80 <= $codepoint && 0xffff >= $codepoint ) + $byte_1 <= 0xEF + && $byte_2 >= 0x80 && $byte_2 <= 0xBF + && $byte_3 >= 0x80 && $byte_3 <= 0xBF + // Exclude surrogate range U+D800 to U+DFFF: + && ! ( 0xED === $byte_1 && $byte_2 >= 0xA0 ) + // Exclude overlong encodings: + && ! ( 0xE0 === $byte_1 && $byte_2 < 0xA0 ) ) { - break; + $byte_length += 3; + continue; } - $byte_length += $bytes_parsed; + // Not a valid identifier character. + break; } // An identifier cannot consist solely of digits. diff --git a/wp-includes/utf8-decoder.php b/wp-includes/utf8-decoder.php deleted file mode 100644 index 55c9ccc6..00000000 --- a/wp-includes/utf8-decoder.php +++ /dev/null @@ -1,293 +0,0 @@ -