From 94269694c7fa047d8899e81c23bc08b5c33760fc Mon Sep 17 00:00:00 2001
From: Jan Jakes <jan@jakes.pro>
Date: Thu, 14 Nov 2024 10:35:25 +0100
Subject: [PATCH] Implement "next_token()" & "get_token()" API

---
 tests/mysql/WP_MySQL_Lexer_Tests.php       | 111 ++++++++++++++++-----
 wp-includes/mysql/class-wp-mysql-lexer.php | 105 +++++++++++++------
 2 files changed, 163 insertions(+), 53 deletions(-)

diff --git a/tests/mysql/WP_MySQL_Lexer_Tests.php b/tests/mysql/WP_MySQL_Lexer_Tests.php
index cd5d7a04..b761ebf8 100644
--- a/tests/mysql/WP_MySQL_Lexer_Tests.php
+++ b/tests/mysql/WP_MySQL_Lexer_Tests.php
@@ -3,22 +3,75 @@
 use PHPUnit\Framework\TestCase;
 
 class WP_MySQL_Lexer_Tests extends TestCase {
+	public function test_tokenize_valid_input(): void {
+		$lexer = new WP_MySQL_Lexer( 'SELECT id FROM users' );
+
+		// SELECT
+		$this->assertTrue( $lexer->next_token() );
+		$this->assertSame( WP_MySQL_Lexer::SELECT_SYMBOL, $lexer->get_token()->get_type() );
+
+		// id
+		$this->assertTrue( $lexer->next_token() );
+		$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $lexer->get_token()->get_type() );
+
+		// FROM
+		$this->assertTrue( $lexer->next_token() );
+		$this->assertSame( WP_MySQL_Lexer::FROM_SYMBOL, $lexer->get_token()->get_type() );
+
+		// users
+		$this->assertTrue( $lexer->next_token() );
+		$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $lexer->get_token()->get_type() );
+
+		// EOF
+		$this->assertTrue( $lexer->next_token() );
+		$this->assertSame( WP_MySQL_Lexer::EOF, $lexer->get_token()->get_type() );
+
+		// No more tokens.
+		$this->assertFalse( $lexer->next_token() );
+		$this->assertNull( $lexer->get_token() );
+
+		// Again, no more tokens.
+		$this->assertFalse( $lexer->next_token() );
+		$this->assertNull( $lexer->get_token() );
+	}
+
+	public function test_tokenize_invalid_input(): void {
+		$lexer = new WP_MySQL_Lexer( "SELECT x'ab01xyz'" );
+
+		// SELECT
+		$this->assertTrue( $lexer->next_token() );
+		$this->assertSame( WP_MySQL_Lexer::SELECT_SYMBOL, $lexer->get_token()->get_type() );
+
+		// Invalid input.
+		$this->assertFalse( $lexer->next_token() );
+		$this->assertNull( $lexer->get_token() );
+
+		// No more tokens.
+		$this->assertFalse( $lexer->next_token() );
+		$this->assertNull( $lexer->get_token() );
+
+		// Again, no more tokens.
+		$this->assertFalse( $lexer->next_token() );
+		$this->assertNull( $lexer->get_token() );
+	}
+
 	/**
 	 * Test that the whole U+0080 to U+FFFF UTF-8 range is valid in an identifier.
 	 * The validity is checked against PCRE with the "u" (PCRE_UTF8) modifier set.
 	 */
 	public function test_identifier_utf8_range(): void {
 		for ( $i = 0x80; $i < 0xffff; $i += 1 ) {
-			$value    = mb_chr( $i, 'UTF-8' );
-			$lexer    = new WP_MySQL_Lexer( $value );
-			$type     = $lexer->next_token()->get_type();
+			$value = mb_chr( $i, 'UTF-8' );
+
+			$lexer = new WP_MySQL_Lexer( $value );
+			$this->assertTrue( $lexer->next_token() );
+
+			$type     = $lexer->get_token()->get_type();
 			$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
 			if ( $is_valid ) {
 				$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type );
-			} elseif ( strlen( $value ) === 0 ) {
-				$this->assertSame( WP_MySQL_Lexer::EOF, $type );
 			} else {
-				$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type );
+				$this->assertSame( WP_MySQL_Lexer::EOF, $type );
 			}
 		}
 	}
@@ -33,14 +86,19 @@ public function test_identifier_utf8_range(): void {
 	public function test_identifier_utf8_two_byte_sequences(): void {
 		for ( $byte_1 = 128; $byte_1 <= 255; $byte_1 += 1 ) {
 			for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) {
-				$value    = chr( $byte_1 ) . chr( $byte_2 );
+				$value = chr( $byte_1 ) . chr( $byte_2 );
+
+				$lexer  = new WP_MySQL_Lexer( $value );
+				$result = $lexer->next_token();
+				$token  = $lexer->get_token();
+
 				$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
-				$lexer    = new WP_MySQL_Lexer( $value );
-				$type     = $lexer->next_token()->get_type();
 				if ( $is_valid ) {
-					$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type );
+					$this->assertTrue( $result );
+					$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $token->get_type() );
 				} else {
-					$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type );
+					$this->assertFalse( $result );
+					$this->assertNull( $token );
 				}
 			}
 		}
@@ -58,14 +116,19 @@ public function test_identifier_utf8_three_byte_sequences(): void {
 		for ( $byte_1 = 0xE0; $byte_1 <= 0xFF; $byte_1 += 1 ) {
 			for ( $byte_2 = 128; $byte_2 <= 255; $byte_2 += 1 ) {
 				for ( $byte_3 = 128; $byte_3 <= 255; $byte_3 += 1 ) {
-					$value    = chr( $byte_1 ) . chr( $byte_2 ) . chr( $byte_3 );
+					$value = chr( $byte_1 ) . chr( $byte_2 ) . chr( $byte_3 );
+
+					$lexer  = new WP_MySQL_Lexer( $value );
+					$result = $lexer->next_token();
+					$token  = $lexer->get_token();
+
 					$is_valid = preg_match( '/^[\x{0080}-\x{ffff}]$/u', $value );
-					$lexer    = new WP_MySQL_Lexer( $value );
-					$type     = $lexer->next_token()->get_type();
 					if ( $is_valid ) {
-						$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $type );
+						$this->assertTrue( $result );
+						$this->assertSame( WP_MySQL_Lexer::IDENTIFIER, $token->get_type() );
 					} else {
-						$this->assertSame( WP_MySQL_Lexer::INVALID_INPUT, $type );
+						$this->assertFalse( $result );
+						$this->assertNull( $token );
 					}
 				}
 			}
@@ -77,8 +140,8 @@ public function test_identifier_utf8_three_byte_sequences(): void {
 	 */
 	public function test_integer_types( $input, $expected ): void {
 		$lexer = new WP_MySQL_Lexer( $input );
-		$type  = $lexer->next_token()->get_type();
-		$this->assertSame( $expected, $type );
+		$this->assertTrue( $lexer->next_token() );
+		$this->assertSame( $expected, $lexer->get_token()->get_type() );
 	}
 
 	public function data_integer_types(): array {
@@ -145,20 +208,20 @@ public function data_identifier_or_number(): array {
 			array( '0b01xyz', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
 			array( '0b', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
 			array( "b'01'", array( WP_MySQL_Lexer::BIN_NUMBER, WP_MySQL_Lexer::EOF ) ),
-			array( "b'01xyz'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
+			array( "b'01xyz'", array() ), // invalid input
 			array( "b''", array( WP_MySQL_Lexer::BIN_NUMBER, WP_MySQL_Lexer::EOF ) ),
-			array( "b'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
-			array( "b'01", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
+			array( "b'", array() ), // invalid input
+			array( "b'01", array() ), // invalid input
 
 			// hex
 			array( '0xab01', array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ),
 			array( '0xab01xyz', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
 			array( '0x', array( WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::EOF ) ), // identifier
 			array( "x'ab01'", array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ),
-			array( "x'ab01xyz'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::IDENTIFIER, WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
+			array( "x'ab01xyz'", array() ), // invalid input
 			array( "x''", array( WP_MySQL_Lexer::HEX_NUMBER, WP_MySQL_Lexer::EOF ) ),
-			array( "x'", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
-			array( "x'ab", array( WP_MySQL_Lexer::INVALID_INPUT, WP_MySQL_Lexer::EOF ) ),
+			array( "x'", array() ), // invalid input
+			array( "x'ab", array() ), // invalid input
 
 			// decimal
 			array( '123.456', array( WP_MySQL_Lexer::DECIMAL_NUMBER, WP_MySQL_Lexer::EOF ) ),
diff --git a/wp-includes/mysql/class-wp-mysql-lexer.php b/wp-includes/mysql/class-wp-mysql-lexer.php
index fdeabe71..bcaca570 100644
--- a/wp-includes/mysql/class-wp-mysql-lexer.php
+++ b/wp-includes/mysql/class-wp-mysql-lexer.php
@@ -936,9 +936,8 @@ class WP_MySQL_Lexer {
 	const MYSQL_COMMENT_END   = 902;
 
 	// Special tokens
-	const WHITESPACE    = 0;
-	const EOF           = -1;
-	const INVALID_INPUT = -2;
+	const WHITESPACE = 0;
+	const EOF        = -1;
 
 	/**
 	 * A map of SQL keyword string values to their corresponding token types.
@@ -2151,6 +2150,17 @@ class WP_MySQL_Lexer {
 	 */
 	private $token_starts_at = 0;
 
+	/**
+	 * The type of the current token.
+	 *
+	 * When a token is successfully recognized and read (including EOF), this value
+	 * is set to the constant representing the token type. When no token was read
+	 * yet, an invalid token is reached, or reading continues past EOF, it is null.
+	 *
+	 * @var int|null
+	 */
+	private $token_type;
+
 	/**
 	 * Whether the tokenizer is inside an active MySQL-specific comment.
 	 *
@@ -2184,22 +2194,56 @@ public function __construct(
 	 *
 	 * This method reads bytes from the SQL payload until a token is recognized.
 	 * It starts from "$this->sql[ $this->bytes_already_read ]", advances the
-	 * number of bytes read, and returns a WP_MySQL_Token object. When the end of
-	 * the SQL payload is reached, the method always returns an EOF token.
+	 * number of bytes read, and returns a boolean indicating whether a token
+	 * was successfully recognized and read. The EOF token is read like any other
+	 * token; after it, or when an invalid token is reached, false is returned.
 	 *
-	 * @return WP_MySQL_Token A token object representing the next recognized token.
+	 * @return bool Whether a token was successfully recognized and read.
 	 */
-	public function next_token(): WP_MySQL_Token {
+	public function next_token(): bool {
+		// We already reached the end of the SQL payload or an invalid token.
+		// Don't attempt to read any more bytes, and bail out immediately.
+		if (
+			self::EOF === $this->token_type
+			|| ( null === $this->token_type && $this->bytes_already_read > 0 )
+		) {
+			$this->token_type = null;
+			return false;
+		}
+
 		do {
 			$this->token_starts_at = $this->bytes_already_read;
-			$type                  = $this->read_next_token();
+			$this->token_type      = $this->read_next_token();
 		} while (
-			self::WHITESPACE === $type
-			|| self::COMMENT === $type
-			|| self::MYSQL_COMMENT_START === $type
-			|| self::MYSQL_COMMENT_END === $type
+			self::WHITESPACE === $this->token_type
+			|| self::COMMENT === $this->token_type
+			|| self::MYSQL_COMMENT_START === $this->token_type
+			|| self::MYSQL_COMMENT_END === $this->token_type
 		);
-		return new WP_MySQL_Token( $type, $this->get_current_token_bytes() );
+
+		// Invalid input.
+		if ( null === $this->token_type ) {
+			return false;
+		}
+		return true;
+	}
+
+	/**
+	 * Return the current token represented as a WP_MySQL_Token object.
+	 *
+	 * When no token was read yet, an invalid token is reached, or reading has
+	 * continued past the EOF token, the method returns null.
+	 *
+	 * @TODO: Consider referential stability ($lexer->get_token() === $lexer->get_token()),
+	 *        or separate getters for the token type and token bytes (no token objects).
+	 *
+	 * @return WP_MySQL_Token|null An object representing the current token, or null.
+	 */
+	public function get_token(): ?WP_MySQL_Token {
+		if ( null === $this->token_type ) {
+			return null;
+		}
+		return new WP_MySQL_Token( $this->token_type, $this->get_current_token_bytes() );
 	}
 
 	/**
@@ -2209,17 +2253,20 @@ public function next_token(): WP_MySQL_Token {
 	 * by "$this->sql[ $this->bytes_already_read ]", and reads all tokens until
 	 * the end of the SQL payload is reached, returning an array of token objects.
 	 *
-	 * It can be used to tokenize the whole SQL payload at once, at the expense of
-	 * storing all token objects in memory at the same time.
+	 * When an invalid token is reached, the method stops and returns the partial
+	 * sequence of valid tokens. In this case, the EOF token will not be included.
+	 *
+	 * This method can be used to tokenize the whole SQL payload at once, at the
+	 * expense of storing all token objects in memory at the same time.
 	 *
 	 * @return WP_MySQL_Token[] An array of token objects representing the remaining tokens.
 	 */
 	public function remaining_tokens(): array {
 		$tokens = array();
-		do {
-			$token    = $this->next_token();
+		while ( true === $this->next_token() ) {
+			$token    = $this->get_token();
 			$tokens[] = $token;
-		} while ( WP_MySQL_Lexer::EOF !== $token->type );
+		}
 		return $tokens;
 	}
 
@@ -2281,7 +2328,7 @@ public static function get_token_name( int $token_id ): ?string {
 		return $token_name ? $token_name : null;
 	}
 
-	private function read_next_token(): int {
+	private function read_next_token(): ?int {
 		$byte      = $this->sql[ $this->bytes_already_read ] ?? null;
 		$next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null;
 
@@ -2362,13 +2409,13 @@ private function read_next_token(): int {
 					if ( $this->mysql_version >= 50713 ) {
 						$type = self::JSON_UNQUOTED_SEPARATOR_SYMBOL;
 					} else {
-						$type = self::INVALID_INPUT;
+						return null; // Invalid input.
 					}
 				} else {
 					if ( $this->mysql_version >= 50708 ) {
 						$type = self::JSON_SEPARATOR_SYMBOL;
 					} else {
-						$type = self::INVALID_INPUT;
+						return null; // Invalid input.
 					}
 				}
 			} else {
@@ -2474,7 +2521,7 @@ private function read_next_token(): int {
 				$this->bytes_already_read += 1; // Consume the 'N'.
 				$type                      = self::NULL2_SYMBOL;
 			} else {
-				$type = self::INVALID_INPUT;
+				return null; // Invalid input.
 			}
 		} elseif ( '#' === $byte ) {
 			$type = $this->read_line_comment();
@@ -2531,7 +2578,7 @@ private function get_current_token_bytes(): string {
 	 *  See:
 	 *    https://dev.mysql.com/doc/refman/8.4/en/identifiers.html
 	 */
-	private function read_identifier(): int {
+	private function read_identifier(): ?int {
 		$started_at = $this->bytes_already_read;
 		while ( true ) {
 			// First, let's try to parse an ASCII sequence.
@@ -2590,10 +2637,10 @@ private function read_identifier(): int {
 
 		return $this->bytes_already_read - $started_at > 0
 			? self::IDENTIFIER
-			: self::INVALID_INPUT;
+			: null; // Invalid input.
 	}
 
-	private function read_number(): int {
+	private function read_number(): ?int {
 		// @TODO: Support numeric-only identifier parts after "." (e.g., 1ea10.1).
 
 		$byte       = $this->sql[ $this->bytes_already_read ] ?? null;
@@ -2619,7 +2666,7 @@ private function read_number(): int {
 					$this->bytes_already_read >= strlen( $this->sql )
 					|| "'" !== $this->sql[ $this->bytes_already_read ]
 				) {
-					return self::INVALID_INPUT;
+					return null; // Invalid input.
 				}
 				$this->bytes_already_read += 1; // Consume the "'".
 			}
@@ -2642,7 +2689,7 @@ private function read_number(): int {
 					$this->bytes_already_read >= strlen( $this->sql )
 					|| "'" !== $this->sql[ $this->bytes_already_read ]
 				) {
-					return self::INVALID_INPUT;
+					return null; // Invalid input.
 				}
 				$this->bytes_already_read += 1; // Consume the "'".
 			}
@@ -2759,7 +2806,7 @@ private function read_number(): int {
 	 *
 	 * @param string $quote The quote character - ', ", or `.
 	 */
-	private function read_quoted_text(): int {
+	private function read_quoted_text(): ?int {
 		$quote                     = $this->sql[ $this->bytes_already_read ];
 		$this->bytes_already_read += 1; // Consume the quote.
 
@@ -2792,7 +2839,7 @@ private function read_quoted_text(): int {
 
 			// Unclosed string - unexpected EOF.
 			if ( ( $this->sql[ $at ] ?? null ) !== $quote ) {
-				return self::INVALID_INPUT;
+				return null; // Invalid input.
 			}
 
 			// Check if the quote is doubled.