From 94269694c7fa047d8899e81c23bc08b5c33760fc Mon Sep 17 00:00:00 2001 From: Jan Jakes Date: Thu, 14 Nov 2024 10:35:25 +0100 Subject: [PATCH] Implement "next_token()" & "get_token()" API --- tests/mysql/WP_MySQL_Lexer_Tests.php | 111 ++++++++++++++++----- wp-includes/mysql/class-wp-mysql-lexer.php | 105 +++++++++++++------ 2 files changed, 163 insertions(+), 53 deletions(-) diff --git a/tests/mysql/WP_MySQL_Lexer_Tests.php b/tests/mysql/WP_MySQL_Lexer_Tests.php index cd5d7a04..b761ebf8 100644 --- a/tests/mysql/WP_MySQL_Lexer_Tests.php +++ b/tests/mysql/WP_MySQL_Lexer_Tests.php @@ -3,22 +3,75 @@ use PHPUnit\Framework\TestCase; class WP_MySQL_Lexer_Tests extends TestCase { + public function test_tokenize_valid_input(): void { + $lexer = new WP_MySQL_Lexer( 'SELECT id FROM users' ); + + // SELECT + $this->assertTrue( $lexer->next_token() ); + $this->assertSame( WP_MySQL_Lexer::SELECT_SYMBOL, $lexer->get_token()->get_type() ); + + // id + $this->assertTrue( $lexer->next_token() ); + $this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $lexer->get_token()->get_type() ); + + // FROM + $this->assertTrue( $lexer->next_token() ); + $this->assertSame( WP_MySQL_Lexer::FROM_SYMBOL, $lexer->get_token()->get_type() ); + + // users + $this->assertTrue( $lexer->next_token() ); + $this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $lexer->get_token()->get_type() ); + + // EOF + $this->assertTrue( $lexer->next_token() ); + $this->assertSame( WP_MySQL_Lexer::EOF, $lexer->get_token()->get_type() ); + + // No more tokens. + $this->assertFalse( $lexer->next_token() ); + $this->assertNull( $lexer->get_token() ); + + // Again, no more tokens. 
+ $this->assertFalse( $lexer->next_token() ); + $this->assertNull( $lexer->get_token() ); + } + + public function test_tokenize_invalid_input(): void { + $lexer = new WP_MySQL_Lexer( "SELECT x'ab01xyz'" ); + + // SELECT + $this->assertTrue( $lexer->next_token() ); + $this->assertSame( WP_MySQL_Lexer::SELECT_SYMBOL, $lexer->get_token()->get_type() ); + + // Invalid input. + $this->assertFalse( $lexer->next_token() ); + $this->assertNull( $lexer->get_token() ); + + // No more tokens. + $this->assertFalse( $lexer->next_token() ); + $this->assertNull( $lexer->get_token() ); + + // Again, no more tokens. + $this->assertFalse( $lexer->next_token() ); + $this->assertNull( $lexer->get_token() ); + } + /** * Test that the whole U+0080 to U+FFFF UTF-8 range is valid in an identifier. * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set. */ public function test_identifier_utf8_range(): void { for ( $i = 0x80; $i < 0xffff; $i += 1 ) { - $value = mb_chr( $i, 'UTF-8' ); - $lexer = new WP_MySQL_Lexer( $value ); - $type = $lexer->next_token()->get_type(); + $value = mb_chr( $i, 'UTF-8' ); + + $lexer = new WP_MySQL_Lexer( $value ); + $this->assertTrue( $lexer->next_token() ); + + $type = $lexer->get_token()->get_type(); $is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value ); if ( $is_valid ) { $this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type ); - } elseif ( strlen( $value ) === 0 ) { - $this->assertSame( WP_MySQL_Lexer::EOF, $type ); } else { - $this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type ); + $this->assertSame( WP_MySQL_Lexer::EOF, $type ); } } } @@ -33,14 +86,19 @@ public function test_identifier_utf8_range(): void { public function test_identifier_utf8_two_byte_sequences(): void { for ( $byte_1 = 128; $byte_1 <= 255; $byte_1 += 1 ) { for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) { - $value = chr( $byte_1 ) . chr( $byte_2 ); + $value = chr( $byte_1 ) . 
chr( $byte_2 ); + + $lexer = new WP_MySQL_Lexer( $value ); + $result = $lexer->next_token(); + $token = $lexer->get_token(); + $is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value ); - $lexer = new WP_MySQL_Lexer( $value ); - $type = $lexer->next_token()->get_type(); if ( $is_valid ) { - $this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type ); + $this->assertTrue( $result ); + $this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $token->get_type() ); } else { - $this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type ); + $this->assertFalse( $result ); + $this->assertNull( $token ); } } } @@ -58,14 +116,19 @@ public function test_identifier_utf8_three_byte_sequences(): void { for ( $byte_1 = 0xE0; $byte_1 <= 0xFF; $byte_1 += 1 ) { for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) { for ( $byte_3 = 128; $byte_3 <= 255; $byte_3 += 1 ) { - $value = chr( $byte_1 ) . chr( $byte_2 ) . chr( $byte_3 ); + $value = chr( $byte_1 ) . chr( $byte_2 ) . chr( $byte_3 ); + + $lexer = new WP_MySQL_Lexer( $value ); + $result = $lexer->next_token(); + $token = $lexer->get_token(); + $is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value ); - $lexer = new WP_MySQL_Lexer( $value ); - $type = $lexer->next_token()->get_type(); if ( $is_valid ) { - $this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type ); + $this->assertTrue( $result ); + $this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $token->get_type() ); } else { - $this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type ); + $this->assertFalse( $result ); + $this->assertNull( $token ); } } } @@ -77,8 +140,8 @@ public function test_identifier_utf8_three_byte_sequences(): void { */ public function test_integer_types( $input, $expected ): void { $lexer = new WP_MySQL_Lexer( $input ); - $type = $lexer->next_token()->get_type(); - $this->assertSame( $expected, $type ); + $this->assertTrue( $lexer->next_token() ); + $this->assertSame( $expected, $lexer->get_token()->get_type() ); } public function data_integer_types(): array { @@ -145,20 
+208,20 @@ public function data_identifier_or_number(): array { array( '0b01xyz', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier array( '0b', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier array( "b'01'", array( WP_MySQL_Lexer::BIN_NUMBER, WP_MySQL_Lexer::EOF ) ), - array( "b'01xyz'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ), + array( "b'01xyz'", array() ), // invalid input array( "b''", array( WP_MySQL_Lexer::BIN_NUMBER, WP_MySQL_Lexer::EOF ) ), - array( "b'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ), - array( "b'01", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ), + array( "b'", array() ), // invalid input + array( "b'01", array() ), // invalid input // hex array( '0xab01', array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ), array( '0xab01xyz', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier array( '0x', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier array( "x'ab01'", array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ), - array( "x'ab01xyz'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ), + array( "x'ab01xyz'", array() ), // invalid input array( "x''", array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ), - array( "x'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ), - array( "x'ab", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ), + array( "x'", array() ), // invalid input + array( "x'ab", array() ), // invalid input // decimal array( '123.456', array( WP_MySQL_Lexer::DECIMAL_NUMBER, WP_MySQL_Lexer::EOF ) ), diff --git a/wp-includes/mysql/class-wp-mysql-lexer.php b/wp-includes/mysql/class-wp-mysql-lexer.php index fdeabe71..bcaca570 100644 --- a/wp-includes/mysql/class-wp-mysql-lexer.php +++ 
b/wp-includes/mysql/class-wp-mysql-lexer.php @@ -936,9 +936,8 @@ class WP_MySQL_Lexer { const MYSQL_COMMENT_END = 902; // Special tokens - const WHITESPACE = 0; - const EOF = -1; - const INVALID_INPUT = -2; + const WHITESPACE = 0; + const EOF = -1; /** * A map of SQL keyword string values to their corresponding token types. @@ -2151,6 +2150,17 @@ class WP_MySQL_Lexer { */ private $token_starts_at = 0; + /** + * The type of the current token. + * + * When a token is successfully recognized and read, this value is set to the + * constant representing the token type. When no token was read yet, or the + * end of the SQL payload or an invalid token is reached, this value is null. + * + * @var int|null + */ + private $token_type; + /** * Whether the tokenizer is inside an active MySQL-specific comment. * @@ -2184,22 +2194,56 @@ public function __construct( * * This method reads bytes from the SQL payload until a token is recognized. * It starts from "$this->sql[ $this->bytes_already_read ]", advances the - * number of bytes read, and returns a WP_MySQL_Token object. When the end of - * the SQL payload is reached, the method always returns an EOF token. + * number of bytes read, and returns a boolean indicating whether a token + * was successfully recognized and read. The end of the SQL payload yields + * one EOF token; after that, or on invalid input, the method returns false. * - * @return WP_MySQL_Token A token object representing the next recognized token. + * @return bool Whether a token was successfully recognized and read. */ - public function next_token(): WP_MySQL_Token { + public function next_token(): bool { + // We already reached the end of the SQL payload or an invalid token. + // Don't attempt to read any more bytes, and bail out immediately. 
+ if ( + self::EOF === $this->token_type + || ( null === $this->token_type && $this->bytes_already_read > 0 ) + ) { + $this->token_type = null; + return false; + } + do { $this->token_starts_at = $this->bytes_already_read; - $type = $this->read_next_token(); + $this->token_type = $this->read_next_token(); } while ( - self::WHITESPACE === $type - || self::COMMENT === $type - || self::MYSQL_COMMENT_START === $type - || self::MYSQL_COMMENT_END === $type + self::WHITESPACE === $this->token_type + || self::COMMENT === $this->token_type + || self::MYSQL_COMMENT_START === $this->token_type + || self::MYSQL_COMMENT_END === $this->token_type ); - return new WP_MySQL_Token( $type, $this->get_current_token_bytes() ); + + // Invalid input. + if ( null === $this->token_type ) { + return false; + } + return true; + } + + /** + * Return the current token represented as a WP_MySQL_Token object. + * + * When no token was read yet, or the end of the SQL payload or an invalid + * token is reached, the method returns null. + * + * @TODO: Consider referential stability ($lexer->get_token() === $lexer->get_token()), + * or separate getters for the token type and token bytes (no token objects). + * + * @return WP_MySQL_Token|null An object representing the current token or null. + */ + public function get_token(): ?WP_MySQL_Token { + if ( null === $this->token_type ) { + return null; + } + return new WP_MySQL_Token( $this->token_type, $this->get_current_token_bytes() ); } /** @@ -2209,17 +2253,20 @@ public function next_token(): WP_MySQL_Token { * by "$this->sql[ $this->bytes_already_read ]", and reads all tokens until * the end of the SQL payload is reached, returning an array of token objects. * - * It can be used to tokenize the whole SQL payload at once, at the expense of - * storing all token objects in memory at the same time. + * When an invalid token is reached, the method stops and returns the partial + * sequence of valid tokens. 
In this case, the EOF token will not be included. + * + * This method can be used to tokenize the whole SQL payload at once, at the + * expense of storing all token objects in memory at the same time. * * @return WP_MySQL_Token[] An array of token objects representing the remaining tokens. */ public function remaining_tokens(): array { $tokens = array(); - do { - $token = $this->next_token(); + while ( true === $this->next_token() ) { + $token = $this->get_token(); $tokens[] = $token; - } while ( WP_MySQL_Lexer::EOF !== $token->type ); + } return $tokens; } @@ -2281,7 +2328,7 @@ public static function get_token_name( int $token_id ): ?string { return $token_name ? $token_name : null; } - private function read_next_token(): int { + private function read_next_token(): ?int { $byte = $this->sql[ $this->bytes_already_read ] ?? null; $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; @@ -2362,13 +2409,13 @@ private function read_next_token(): int { if ( $this->mysql_version >= 50713 ) { $type = self::JSON_UNQUOTED_SEPARATOR_SYMBOL; } else { - $type = self::INVALID_INPUT; + return null; // Invalid input. } } else { if ( $this->mysql_version >= 50708 ) { $type = self::JSON_SEPARATOR_SYMBOL; } else { - $type = self::INVALID_INPUT; + return null; // Invalid input. } } } else { @@ -2474,7 +2521,7 @@ private function read_next_token(): int { $this->bytes_already_read += 1; // Consume the 'N'. $type = self::NULL2_SYMBOL; } else { - $type = self::INVALID_INPUT; + return null; // Invalid input. } } elseif ( '#' === $byte ) { $type = $this->read_line_comment(); @@ -2531,7 +2578,7 @@ private function get_current_token_bytes(): string { * See: * https://dev.mysql.com/doc/refman/8.4/en/identifiers.html */ - private function read_identifier(): int { + private function read_identifier(): ?int { $started_at = $this->bytes_already_read; while ( true ) { // First, let's try to parse an ASCII sequence. 
@@ -2590,10 +2637,10 @@ private function read_identifier(): int { return $this->bytes_already_read - $started_at > 0 ? self::IDENTIFIER - : self::INVALID_INPUT; + : null; // Invalid input. } - private function read_number(): int { + private function read_number(): ?int { // @TODO: Support numeric-only identifier parts after "." (e.g., 1ea10.1). $byte = $this->sql[ $this->bytes_already_read ] ?? null; @@ -2619,7 +2666,7 @@ private function read_number(): int { $this->bytes_already_read >= strlen( $this->sql ) || "'" !== $this->sql[ $this->bytes_already_read ] ) { - return self::INVALID_INPUT; + return null; // Invalid input. } $this->bytes_already_read += 1; // Consume the "'". } @@ -2642,7 +2689,7 @@ private function read_number(): int { $this->bytes_already_read >= strlen( $this->sql ) || "'" !== $this->sql[ $this->bytes_already_read ] ) { - return self::INVALID_INPUT; + return null; // Invalid input. } $this->bytes_already_read += 1; // Consume the "'". } @@ -2759,7 +2806,7 @@ private function read_number(): int { * * @param string $quote The quote character - ', ", or `. */ - private function read_quoted_text(): int { + private function read_quoted_text(): ?int { $quote = $this->sql[ $this->bytes_already_read ]; $this->bytes_already_read += 1; // Consume the quote. @@ -2792,7 +2839,7 @@ private function read_quoted_text(): int { // Unclosed string - unexpected EOF. if ( ( $this->sql[ $at ] ?? null ) !== $quote ) { - return self::INVALID_INPUT; + return null; // Invalid input. } // Check if the quote is doubled.