Skip to content

Commit

Permalink
Check for the U+0080-U+FFFF range manually, add test coverage
Browse files Browse the repository at this point in the history
  • Loading branch information
JanJakes committed Nov 11, 2024
1 parent dff4649 commit 17641f7
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 311 deletions.
1 change: 0 additions & 1 deletion tests/bootstrap.php
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
<?php

require_once __DIR__ . '/wp-sqlite-schema.php';
require_once __DIR__ . '/../wp-includes/utf8-decoder.php';
require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-token.php';
require_once __DIR__ . '/../wp-includes/mysql/class-wp-mysql-lexer.php';
require_once __DIR__ . '/../wp-includes/parser/class-wp-parser-grammar.php';
Expand Down
69 changes: 69 additions & 0 deletions tests/mysql/WP_MySQL_Lexer_Tests.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,75 @@
use PHPUnit\Framework\TestCase;

class WP_MySQL_Lexer_Tests extends TestCase {
/**
 * Test that the whole U+0080 to U+FFFF UTF-8 range is valid in an identifier.
 * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set.
 *
 * Every code point in the range, including both boundaries, is encoded with
 * mb_chr() and passed to the lexer as the entire input. Three outcomes are
 * distinguished:
 *
 *   1. PCRE accepts the encoded value: the lexer must yield IDENTIFIER.
 *   2. The encoded value is empty (mb_chr() produced no character for the
 *      code point, presumably for the surrogates U+D800-U+DFFF): the lexer
 *      sees an empty input and must yield EOF.
 *   3. Anything else: the lexer must yield INVALID_INPUT.
 */
public function test_identifier_utf8_range(): void {
	// The upper bound is inclusive so that U+FFFF itself is covered as well.
	for ( $i = 0x80; $i <= 0xffff; $i += 1 ) {
		$value    = mb_chr( $i, 'UTF-8' );
		$lexer    = new WP_MySQL_Lexer( $value );
		$type     = $lexer->next_token()->get_type();
		$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );

		// Name the code point on failure; otherwise a failing assertion in
		// this loop gives no hint about which input triggered it.
		$message = sprintf( 'Unexpected token type for code point U+%04X.', $i );

		if ( $is_valid ) {
			$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type, $message );
		} elseif ( strlen( $value ) === 0 ) {
			$this->assertSame( WP_MySQL_Lexer::EOF, $type, $message );
		} else {
			$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type, $message );
		}
	}
}

/**
 * Exercise every combination of two high bytes (0x80-0xFF each) as lexer input.
 *
 * The range deliberately includes all invalid 2-byte UTF-8 sequences, so the
 * test also verifies that malformed input is never accepted as an identifier.
 * PCRE with the "u" (PCRE_UTF8) modifier serves as the reference: when it
 * matches the sequence as a single character in U+0080-U+FFFF, the lexer must
 * produce IDENTIFIER; in every other case it must produce INVALID_INPUT.
 */
public function test_identifier_utf8_two_byte_sequences(): void {
	$high_bytes = range( 128, 255 );

	foreach ( $high_bytes as $lead ) {
		foreach ( $high_bytes as $trail ) {
			$input = chr( $lead ) . chr( $trail );

			// Reference verdict from PCRE; any falsy result means "not valid".
			$expected = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $input )
				? WP_MySQL_Lexer::IDENTIFIER
				: WP_MySQL_Lexer::INVALID_INPUT;

			$tokenizer = new WP_MySQL_Lexer( $input );
			$actual    = $tokenizer->next_token()->get_type();

			$this->assertSame( $expected, $actual );
		}
	}
}

/**
 * Exercise 3-byte sequences as lexer input, both valid and invalid UTF-8.
 *
 * The first byte runs from 0xE0 (the lowest lead byte of a 3-byte sequence)
 * up to 0xFF; the second and third bytes each run from 0x80 to 0xFF. This
 * deliberately includes all invalid 3-byte UTF-8 sequences, so the test also
 * verifies that they are never accepted as identifiers.
 *
 * PCRE with the "u" (PCRE_UTF8) modifier serves as the reference: when it
 * matches the sequence as a single character in U+0080-U+FFFF, the lexer must
 * produce IDENTIFIER; in every other case it must produce INVALID_INPUT.
 */
public function test_identifier_utf8_three_byte_sequences(): void {
	for ( $lead = 0xE0; $lead < 0x100; ++$lead ) {
		$one_byte = chr( $lead );

		for ( $middle = 0x80; $middle < 0x100; ++$middle ) {
			$two_bytes = $one_byte . chr( $middle );

			for ( $last = 0x80; $last < 0x100; ++$last ) {
				$sequence = $two_bytes . chr( $last );
				$matched  = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $sequence );
				$actual   = ( new WP_MySQL_Lexer( $sequence ) )->next_token()->get_type();

				// Anything PCRE does not accept must be rejected by the lexer.
				if ( ! $matched ) {
					$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $actual );
					continue;
				}

				$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $actual );
			}
		}
	}
}

/**
* Numbers vs. identifiers:
*
Expand Down
1 change: 0 additions & 1 deletion tests/tools/run-lexer-benchmark.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ function ( $severity, $message, $file, $line ) {
}
);

require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';

Expand Down
1 change: 0 additions & 1 deletion tests/tools/run-parser-benchmark.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ function ( $severity, $message, $file, $line ) {
}
);

require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser-grammar.php';
Expand Down
1 change: 0 additions & 1 deletion tests/tools/run-parser-test.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ function ( $severity, $message, $file, $line ) {
}
);

require_once __DIR__ . '/../../wp-includes/utf8-decoder.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-token.php';
require_once __DIR__ . '/../../wp-includes/mysql/class-wp-mysql-lexer.php';
require_once __DIR__ . '/../../wp-includes/parser/class-wp-parser.php';
Expand Down
48 changes: 34 additions & 14 deletions wp-includes/mysql/class-wp-mysql-lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -2483,7 +2483,7 @@ private function get_current_token_bytes(): string {
* https://dev.mysql.com/doc/refman/8.4/en/identifiers.html
*
* Rules:
* 1. Allowed characters are ASCII a-z, A-Z, 0-9, _, $, and Unicode \x{0080}-\x{ffff}.
* 1. Allowed characters are ASCII a-z, A-Z, 0-9, _, $, and Unicode U+0080-U+FFFF.
* 2. Unquoted identifiers may begin with a digit but may not consist solely of digits.
*/
private function parse_identifier(): int {
Expand All @@ -2497,28 +2497,48 @@ private function parse_identifier(): int {
$this->bytes_already_read + $byte_length
);

// Check if the following byte can be part of a multibyte character.
// If not, bail out early to avoid unnecessary UTF-8 decoding.
$byte = $this->sql[ $this->bytes_already_read + $byte_length ] ?? null;
if ( null === $byte || ord( $byte ) < 128 ) {
// Check if the following byte can be part of a multibyte character
// in the range of U+0080 to U+FFFF before looking at further bytes.
// If it can't, bail out early to avoid unnecessary UTF-8 decoding.
// Identifiers are usually ASCII-only, so we can optimize for that.
$byte_1 = ord(
$this->sql[ $this->bytes_already_read + $byte_length ] ?? ''
);
if ( $byte_1 < 0xC2 || $byte_1 > 0xEF ) {
break;
}

// Check the \x{0080}-\x{ffff} Unicode character range.
$codepoint = utf8_codepoint_at(
$this->sql,
$this->bytes_already_read + $byte_length,
$bytes_parsed
// Look for a valid 2-byte UTF-8 symbol. Covers range U+0080 - U+07FF.
$byte_2 = ord(
$this->sql[ $this->bytes_already_read + $byte_length + 1 ] ?? ''
);
if (
$byte_1 <= 0xDF
&& $byte_2 >= 0x80 && $byte_2 <= 0xBF
) {
$byte_length += 2;
continue;
}

// Look for a valid 3-byte UTF-8 symbol in range U+0800 - U+FFFF.
$byte_3 = ord(
$this->sql[ $this->bytes_already_read + $byte_length + 2 ] ?? ''
);
if (
null === $codepoint
|| ! ( 0x80 <= $codepoint && 0xffff >= $codepoint )
$byte_1 <= 0xEF
&& $byte_2 >= 0x80 && $byte_2 <= 0xBF
&& $byte_3 >= 0x80 && $byte_3 <= 0xBF
// Exclude surrogate range U+D800 to U+DFFF:
&& ! ( 0xED === $byte_1 && $byte_2 >= 0xA0 )
// Exclude overlong encodings:
&& ! ( 0xE0 === $byte_1 && $byte_2 < 0xA0 )
) {
break;
$byte_length += 3;
continue;
}

$byte_length += $bytes_parsed;
// Not a valid identifier character.
break;
}

// An identifier cannot consist solely of digits.
Expand Down
Loading

0 comments on commit 17641f7

Please sign in to comment.