From 6678856e9113835e66d2b3bb33b4633a9375d77e Mon Sep 17 00:00:00 2001 From: Jan Jakes Date: Fri, 8 Nov 2024 16:44:37 +0100 Subject: [PATCH] Improve property and method naming and documentation --- tests/mysql/WP_MySQL_Lexer_Tests.php | 3 +- .../WP_MySQL_Server_Suite_Lexer_Tests.php | 3 +- .../WP_MySQL_Server_Suite_Parser_Tests.php | 3 +- tests/tools/run-lexer-benchmark.php | 3 +- tests/tools/run-parser-benchmark.php | 3 +- tests/tools/run-parser-test.php | 3 +- wp-includes/mysql/class-wp-mysql-lexer.php | 548 +++++++++++------- 7 files changed, 349 insertions(+), 217 deletions(-) diff --git a/tests/mysql/WP_MySQL_Lexer_Tests.php b/tests/mysql/WP_MySQL_Lexer_Tests.php index 0a9ae008..573764aa 100644 --- a/tests/mysql/WP_MySQL_Lexer_Tests.php +++ b/tests/mysql/WP_MySQL_Lexer_Tests.php @@ -13,11 +13,12 @@ class WP_MySQL_Lexer_Tests extends TestCase { * @dataProvider data_identifier_or_number */ public function test_identifier_or_number( $input, $expected ): void { + $lexer = new WP_MySQL_Lexer( $input ); $actual = array_map( function ( $token ) { return $token->get_type(); }, - WP_MySQL_Lexer::tokenize( $input ) + $lexer->remaining_tokens() ); // Compare token names to get more readable error messages. 
diff --git a/tests/mysql/WP_MySQL_Server_Suite_Lexer_Tests.php b/tests/mysql/WP_MySQL_Server_Suite_Lexer_Tests.php index 11587350..53a786ab 100644 --- a/tests/mysql/WP_MySQL_Server_Suite_Lexer_Tests.php +++ b/tests/mysql/WP_MySQL_Server_Suite_Lexer_Tests.php @@ -23,7 +23,8 @@ public function test_tokenize_mysql_test_suite(): void { try { while ( ( $record = fgetcsv( $handle ) ) !== false ) { $query = $record[0]; - $tokens = WP_MySQL_Lexer::tokenize( $query ); + $lexer = new WP_MySQL_Lexer( $query ); + $tokens = $lexer->remaining_tokens(); $this->assertNotEmpty( $tokens, "Failed to tokenize query: $query" ); } } finally { diff --git a/tests/mysql/WP_MySQL_Server_Suite_Parser_Tests.php b/tests/mysql/WP_MySQL_Server_Suite_Parser_Tests.php index f69a3ac1..96efc733 100644 --- a/tests/mysql/WP_MySQL_Server_Suite_Parser_Tests.php +++ b/tests/mysql/WP_MySQL_Server_Suite_Parser_Tests.php @@ -48,7 +48,8 @@ public function test_parse_mysql_test_suite( array $batch ): void { foreach ( $batch as $record ) { $query = $record[0]; - $tokens = WP_MySQL_Lexer::tokenize( $query ); + $lexer = new WP_MySQL_Lexer( $query ); + $tokens = $lexer->remaining_tokens(); $this->assertNotEmpty( $tokens, "Failed to tokenize query: $query" ); $parser = new WP_MySQL_Parser( self::$grammar, $tokens ); diff --git a/tests/tools/run-lexer-benchmark.php b/tests/tools/run-lexer-benchmark.php index 7be2d8fb..2564f330 100644 --- a/tests/tools/run-lexer-benchmark.php +++ b/tests/tools/run-lexer-benchmark.php @@ -27,7 +27,8 @@ function ( $severity, $message, $file, $line ) { $start = microtime( true ); for ( $i = 0; $i < count( $records ); $i += 1 ) { $query = $records[ $i ][0]; - $tokens = WP_MySQL_Lexer::tokenize( $query ); + $lexer = new WP_MySQL_Lexer( $query ); + $tokens = $lexer->remaining_tokens(); if ( count( $tokens ) === 0 ) { throw new Exception( 'Failed to tokenize query: ' . 
$query ); } diff --git a/tests/tools/run-parser-benchmark.php b/tests/tools/run-parser-benchmark.php index 6f13fad2..afba53e2 100644 --- a/tests/tools/run-parser-benchmark.php +++ b/tests/tools/run-parser-benchmark.php @@ -55,7 +55,8 @@ function getStats( $total, $failures, $exceptions ) { } try { - $tokens = WP_MySQL_Lexer::tokenize( $query ); + $lexer = new WP_MySQL_Lexer( $query ); + $tokens = $lexer->remaining_tokens(); if ( count( $tokens ) === 0 ) { throw new Exception( 'Failed to tokenize query: ' . $query ); } diff --git a/tests/tools/run-parser-test.php b/tests/tools/run-parser-test.php index 164c9b55..78fd32ac 100644 --- a/tests/tools/run-parser-test.php +++ b/tests/tools/run-parser-test.php @@ -24,7 +24,8 @@ function ( $severity, $message, $file, $line ) { $grammar = new WP_Parser_Grammar( $grammar_data ); // Edit the query below to test different inputs: -$tokens = WP_MySQL_Lexer::tokenize( 'SELECT 1' ); +$lexer = new WP_MySQL_Lexer( 'SELECT 1' ); +$tokens = $lexer->remaining_tokens(); echo "Tokens:\n"; foreach ( $tokens as $token ) { diff --git a/wp-includes/mysql/class-wp-mysql-lexer.php b/wp-includes/mysql/class-wp-mysql-lexer.php index 12a0e957..064eaa8b 100644 --- a/wp-includes/mysql/class-wp-mysql-lexer.php +++ b/wp-includes/mysql/class-wp-mysql-lexer.php @@ -2049,63 +2049,98 @@ class WP_MySQL_Lexer { '_utf8mb4' => true, ); - protected $input; - protected $position = 0; - protected $last_token_end_position = 0; - protected $server_version; - protected $sql_modes; - protected $in_version_comment = false; - - public function __construct( string $input, int $server_version = 80038, int $sql_modes = 0 ) { - $this->input = $input; - $this->server_version = $server_version; - $this->sql_modes = $sql_modes; - } + /** + * The SQL payload to tokenize. 
+ * + * @var string + */ + private $sql; - public static function tokenize( $sql ): array { - $lexer = new WP_MySQL_Lexer( $sql ); - $tokens = array(); - do { - $token = $lexer->get_next_token(); - $tokens[] = $token; - } while ( WP_MySQL_Lexer::EOF !== $token->type ); - return $tokens; - } + /** + * The version of the MySQL server that the SQL payload is intended for. + * + * This is used to determine which tokens are valid for the given MySQL + * version, and how some tokens should be interpreted. + * + * @var int + */ + private $mysql_version; - public function is_sql_mode_active( int $mode ): bool { - return ( $this->sql_modes & $mode ) !== 0; - } + /** + * The SQL modes that should be considered active during tokenization. + * + * This is an integer that represents currently active SQL modes as a bitmask. + * The SQL modes are defined as "SQL_MODE_"-prefixed constants in this class. + * The list of the SQL modes isn't exhaustive, as only some affect tokenization. + * + * @var int + */ + private $sql_modes; - public function get_server_version() { - return $this->server_version; - } + /** + * How many bytes from the original SQL payload have been read and tokenized. + * + * This is an internal cursor that is used to track the current position in + * the SQL payload during tokenization. When used as an index in the SQL + * payload, it points to the next byte to read. + * + * @var int + */ + private $bytes_already_read = 0; - public static function get_token_id( string $token_name ): int { - if ( 'ε' === $token_name ) { - return self::EMPTY_TOKEN; - } - return constant( self::class . '::' . $token_name ); - } + /** + * Byte offset in the SQL payload where current token starts. + * + * This is used to extract the token bytes after the token is processed. + * The bytes of the current token are represented by "$this->sql" in range + * from "$this->token_starts_at" to "$this->bytes_already_read - 1". 
+ * + * @var int + */ + private $token_starts_at = 0; - public static function get_token_name( $token_type ): string { - $reflection = new ReflectionClass( self::class ); - $constants = array_reverse( $reflection->getConstants() ); // some constants can conflict, tokens are at the end - $token_name = array_search( $token_type, $constants, true ); - return $token_name ? $token_name : ''; - } + /** + * Whether the tokenizer is inside an active MySQL-specific comment. + * + * MySQL supports a special comment syntax whose content is recognized as + * a comment by most database engines, but can be treated as SQL by MySQL: + * + * 1. /*! ... - The content is treated as SQL. + * 2. /*!12345 - The content is treated as SQL when "MySQL version >= 12345". + * + * @var bool + */ + private $in_mysql_comment = false; - public function get_text(): string { - return substr( - $this->input, - $this->last_token_end_position, - $this->position - $this->last_token_end_position - ); + /** + * @param string $sql The SQL payload to tokenize. + * @param int $mysql_version The version of the MySQL server that the SQL payload is intended for. + * @param int $sql_modes The SQL modes that should be considered active during tokenization. + */ + public function __construct( + string $sql, + int $mysql_version = 80038, + int $sql_modes = 0 + ) { + $this->sql = $sql; + $this->mysql_version = $mysql_version; + $this->sql_modes = $sql_modes; } - public function get_next_token(): WP_MySQL_Token { + /** + * Read the next token from the SQL payload and return it as a token object. + * + * This method reads bytes from the SQL payload until a token is recognized. + * It starts from "$this->sql[ $this->bytes_already_read ]", advances the + * number of bytes read, and returns a WP_MySQL_Token object. When the end of + * the SQL payload is reached, the method always returns an EOF token. + * + * @return WP_MySQL_Token A token object representing the next recognized token. 
+ */ + public function next_token(): WP_MySQL_Token { do { - $type = $this->read_next_token(); - $this->last_token_end_position = $this->position; + $type = $this->read_next_token(); + $this->token_starts_at = $this->bytes_already_read; } while ( in_array( $type, @@ -2118,12 +2153,95 @@ public function get_next_token(): WP_MySQL_Token { true ) ); - return new WP_MySQL_Token( $type, $this->get_text() ); + return new WP_MySQL_Token( $type, $this->get_current_token_bytes() ); + } + + /** + * Read all remaining tokens from the SQL payload and return them as an array. + * + * This method starts from the current position in the SQL payload, as marked + * by "$this->sql[ $this->bytes_already_read ]", and reads all tokens until + * the end of the SQL payload is reached, returning an array of token objects. + * + * It can be used to tokenize the whole SQL payload at once, at the expense of + * storing all token objects in memory at the same time. + * + * @return WP_MySQL_Token[] An array of token objects representing the remaining tokens. + */ + public function remaining_tokens(): array { + $tokens = array(); + do { + $token = $this->next_token(); + $tokens[] = $token; + } while ( WP_MySQL_Lexer::EOF !== $token->type ); + return $tokens; + } + + /** + * The version of the MySQL server that the SQL payload is intended for. + * + * This represents the MySQL server version that the lexer is set up to + * consider when tokenizing the SQL payload. + * + * @return int The MySQL server version that the lexer is set up to consider. + */ + public function get_mysql_version(): int { + return $this->mysql_version; + } + + /** + * Whether an SQL mode is set to be considered as active during tokenization. + * The SQL modes are defined as "SQL_MODE_"-prefixed constants in this class. + * + * @param int $mode The SQL mode to check, an "SQL_MODE_"-prefixed constant. + * @return bool Whether the given SQL mode is active. 
+ */ + public function is_sql_mode_active( int $mode ): bool { + return ( $this->sql_modes & $mode ) !== 0; + } + + /** + * Get the numeric token ID for a given token name. + * + * @param string $token_name The name of the token. + * @return int|null The token ID for the given token name; null when not found. + */ + public static function get_token_id( string $token_name ): ?int { + if ( 'ε' === $token_name ) { + return self::EMPTY_TOKEN; + } + + $constant_name = self::class . '::' . $token_name; + if ( ! defined( $constant_name ) ) { + return null; + } + return constant( $constant_name ); + } + + /** + * Get the name of a token for a given token ID. + * + * This method is intended to be used only for testing and debugging purposes, + * when tokens need to be presented by their names in a human-readable form. + * It should not be used in production code, as it's not performance-optimized. + * + * @param int $token_id The numeric token ID. + * @return string|null The token name for the given token ID; null when not found. + */ + public static function get_token_name( int $token_id ): ?string { + $reflection = new ReflectionClass( self::class ); + // Reverse the array, as some constant values in the class can conflict, + // and tokens are defined at the end of the class constant definitions. + // @TODO: Consider a more robust way to determine the token name. + // E.g., prefix all token constant names with a common prefix. + $constants = array_reverse( $reflection->getConstants() ); + $token_name = array_search( $token_id, $constants, true ); + return $token_name ? $token_name : null; } private function read_next_token(): int { - $la = $this->input[ $this->position ] ?? null; - $la2 = $this->input[ $this->position + 1 ] ?? null; + $la = $this->sql[ $this->bytes_already_read ] ?? null; + $la2 = $this->sql[ $this->bytes_already_read + 1 ] ??
null; if ( "'" === $la || '"' === $la || '`' === $la ) { $type = $this->read_quoted_text( $la ); @@ -2133,157 +2251,157 @@ private function read_next_token(): int { if ( $this->is_digit( $la2 ) ) { $type = $this->read_number(); } else { - $this->position += 1; - $type = self::DOT_SYMBOL; + $this->bytes_already_read += 1; + $type = self::DOT_SYMBOL; } } elseif ( '=' === $la ) { - $this->position += 1; - $type = self::EQUAL_OPERATOR; + $this->bytes_already_read += 1; + $type = self::EQUAL_OPERATOR; } elseif ( ':' === $la ) { - $this->position += 1; // Consume the ':'. + $this->bytes_already_read += 1; // Consume the ':'. if ( '=' === $la2 ) { - $this->position += 1; // Consume the '='. - $type = self::ASSIGN_OPERATOR; + $this->bytes_already_read += 1; // Consume the '='. + $type = self::ASSIGN_OPERATOR; } else { $type = self::COLON_SYMBOL; } } elseif ( '<' === $la ) { - $this->position += 1; // Consume the '<'. + $this->bytes_already_read += 1; // Consume the '<'. if ( '=' === $la2 ) { - $this->position += 1; // Consume the '='. - if ( '>' === ( $this->input[ $this->position ] ?? null ) ) { - $this->position += 1; // Consume the '>'. - $type = self::NULL_SAFE_EQUAL_OPERATOR; + $this->bytes_already_read += 1; // Consume the '='. + if ( '>' === ( $this->sql[ $this->bytes_already_read ] ?? null ) ) { + $this->bytes_already_read += 1; // Consume the '>'. + $type = self::NULL_SAFE_EQUAL_OPERATOR; } else { $type = self::LESS_OR_EQUAL_OPERATOR; } } elseif ( '>' === $la2 ) { - $this->position += 1; // Consume the '>'. - $type = self::NOT_EQUAL_OPERATOR; + $this->bytes_already_read += 1; // Consume the '>'. + $type = self::NOT_EQUAL_OPERATOR; } elseif ( '<' === $la2 ) { - $this->position += 1; // Consume the '<'. - $type = self::SHIFT_LEFT_OPERATOR; + $this->bytes_already_read += 1; // Consume the '<'. + $type = self::SHIFT_LEFT_OPERATOR; } else { $type = self::LESS_THAN_OPERATOR; } } elseif ( '>' === $la ) { - $this->position += 1; // Consume the '>'. 
+ $this->bytes_already_read += 1; // Consume the '>'. if ( '=' === $la2 ) { - $this->position += 1; // Consume the '='. - $type = self::GREATER_OR_EQUAL_OPERATOR; + $this->bytes_already_read += 1; // Consume the '='. + $type = self::GREATER_OR_EQUAL_OPERATOR; } elseif ( '>' === $la2 ) { - $this->position += 1; // Consume the '>'. - $type = self::SHIFT_RIGHT_OPERATOR; + $this->bytes_already_read += 1; // Consume the '>'. + $type = self::SHIFT_RIGHT_OPERATOR; } else { $type = self::GREATER_THAN_OPERATOR; } } elseif ( '!' === $la ) { - $this->position += 1; // Consume the '!'. + $this->bytes_already_read += 1; // Consume the '!'. if ( '=' === $la2 ) { - $this->position += 1; // Consume the '='. - $type = self::NOT_EQUAL_OPERATOR; + $this->bytes_already_read += 1; // Consume the '='. + $type = self::NOT_EQUAL_OPERATOR; } else { $type = self::LOGICAL_NOT_OPERATOR; } } elseif ( '+' === $la ) { - $this->position += 1; - $type = self::PLUS_OPERATOR; + $this->bytes_already_read += 1; + $type = self::PLUS_OPERATOR; } elseif ( '-' === $la ) { - if ( '-' === $la2 && $this->is_whitespace( $this->input[ $this->position + 2 ] ?? null ) ) { + if ( '-' === $la2 && $this->is_whitespace( $this->sql[ $this->bytes_already_read + 2 ] ?? null ) ) { $type = $this->read_line_comment(); } elseif ( '>' === $la2 ) { - $this->position += 2; // Consume the '->'. - if ( '>' === ( $this->input[ $this->position ] ?? null ) ) { - $this->position += 1; // Consume the '>'. - if ( $this->server_version >= 50713 ) { + $this->bytes_already_read += 2; // Consume the '->'. + if ( '>' === ( $this->sql[ $this->bytes_already_read ] ?? null ) ) { + $this->bytes_already_read += 1; // Consume the '>'. 
+ if ( $this->mysql_version >= 50713 ) { $type = self::JSON_UNQUOTED_SEPARATOR_SYMBOL; } else { $type = self::INVALID_INPUT; } } else { - if ( $this->server_version >= 50708 ) { + if ( $this->mysql_version >= 50708 ) { $type = self::JSON_SEPARATOR_SYMBOL; } else { $type = self::INVALID_INPUT; } } } else { - $this->position += 1; // Consume the '-'. - $type = self::MINUS_OPERATOR; + $this->bytes_already_read += 1; // Consume the '-'. + $type = self::MINUS_OPERATOR; } } elseif ( '*' === $la ) { - $this->position += 1; - if ( '/' === $la2 && $this->in_version_comment ) { - $this->position += 1; // Consume the '/'. - $type = self::MYSQL_COMMENT_END; - $this->in_version_comment = false; + $this->bytes_already_read += 1; + if ( '/' === $la2 && $this->in_mysql_comment ) { + $this->bytes_already_read += 1; // Consume the '/'. + $type = self::MYSQL_COMMENT_END; + $this->in_mysql_comment = false; } else { $type = self::MULT_OPERATOR; } } elseif ( '/' === $la ) { if ( '*' === $la2 ) { - if ( '!' === ( $this->input[ $this->position + 2 ] ?? null ) ) { + if ( '!' === ( $this->sql[ $this->bytes_already_read + 2 ] ?? null ) ) { $type = $this->read_mysql_comment(); } else { - $this->position += 2; // Consume the '/*'. + $this->bytes_already_read += 2; // Consume the '/*'. $this->read_comment_content(); $type = self::COMMENT; } } else { - $this->position += 1; - $type = self::DIV_OPERATOR; + $this->bytes_already_read += 1; + $type = self::DIV_OPERATOR; } } elseif ( '%' === $la ) { - $this->position += 1; - $type = self::MOD_OPERATOR; + $this->bytes_already_read += 1; + $type = self::MOD_OPERATOR; } elseif ( '&' === $la ) { - $this->position += 1; // Consume the '&'. + $this->bytes_already_read += 1; // Consume the '&'. if ( '&' === $la2 ) { - $this->position += 1; // Consume the '&'. - $type = self::LOGICAL_AND_OPERATOR; + $this->bytes_already_read += 1; // Consume the '&'. 
+ $type = self::LOGICAL_AND_OPERATOR; } else { $type = self::BITWISE_AND_OPERATOR; } } elseif ( '^' === $la ) { - $this->position += 1; - $type = self::BITWISE_XOR_OPERATOR; + $this->bytes_already_read += 1; + $type = self::BITWISE_XOR_OPERATOR; } elseif ( '|' === $la ) { - $this->position += 1; // Consume the '|'. + $this->bytes_already_read += 1; // Consume the '|'. if ( '|' === $la2 ) { - $this->position += 1; // Consume the '|'. - $type = $this->is_sql_mode_active( self::SQL_MODE_PIPES_AS_CONCAT ) + $this->bytes_already_read += 1; // Consume the '|'. + $type = $this->is_sql_mode_active( self::SQL_MODE_PIPES_AS_CONCAT ) ? self::CONCAT_PIPES_SYMBOL : self::LOGICAL_OR_OPERATOR; } else { $type = self::BITWISE_OR_OPERATOR; } } elseif ( '~' === $la ) { - $this->position += 1; - $type = self::BITWISE_NOT_OPERATOR; + $this->bytes_already_read += 1; + $type = self::BITWISE_NOT_OPERATOR; } elseif ( ',' === $la ) { - $this->position += 1; - $type = self::COMMA_SYMBOL; + $this->bytes_already_read += 1; + $type = self::COMMA_SYMBOL; } elseif ( ';' === $la ) { - $this->position += 1; - $type = self::SEMICOLON_SYMBOL; + $this->bytes_already_read += 1; + $type = self::SEMICOLON_SYMBOL; } elseif ( '(' === $la ) { - $this->position += 1; - $type = self::OPEN_PAR_SYMBOL; + $this->bytes_already_read += 1; + $type = self::OPEN_PAR_SYMBOL; } elseif ( ')' === $la ) { - $this->position += 1; - $type = self::CLOSE_PAR_SYMBOL; + $this->bytes_already_read += 1; + $type = self::CLOSE_PAR_SYMBOL; } elseif ( '{' === $la ) { - $this->position += 1; - $type = self::OPEN_CURLY_SYMBOL; + $this->bytes_already_read += 1; + $type = self::OPEN_CURLY_SYMBOL; } elseif ( '}' === $la ) { - $this->position += 1; - $type = self::CLOSE_CURLY_SYMBOL; + $this->bytes_already_read += 1; + $type = self::CLOSE_CURLY_SYMBOL; } elseif ( '@' === $la ) { - $this->position += 1; // Consume the '@'. + $this->bytes_already_read += 1; // Consume the '@'. 
if ( '@' === $la2 ) { - $this->position += 1; // Consume the second '@'. - $type = self::AT_AT_SIGN_SYMBOL; + $this->bytes_already_read += 1; // Consume the second '@'. + $type = self::AT_AT_SIGN_SYMBOL; } else { /** * Check whether the '@' marks an unquoted user-defined variable: @@ -2293,65 +2411,73 @@ private function read_next_token(): int { * 1. Starts with a '@'. * 2. Allowed following characters are ASCII a-z, A-Z, 0-9, _, ., $. */ - $length = strspn( $this->input, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_.$', $this->position ); + $length = strspn( $this->sql, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_.$', $this->bytes_already_read ); if ( $length > 0 ) { - $this->position += $length; - $type = self::AT_TEXT_SUFFIX; + $this->bytes_already_read += $length; + $type = self::AT_TEXT_SUFFIX; } else { $type = self::AT_SIGN_SYMBOL; } } } elseif ( '?' === $la ) { - $this->position += 1; - $type = self::PARAM_MARKER; + $this->bytes_already_read += 1; + $type = self::PARAM_MARKER; } elseif ( '\\' === $la ) { - $this->position += 1; // Consume the '\'. + $this->bytes_already_read += 1; // Consume the '\'. if ( 'N' === $la2 ) { - $this->position += 1; // Consume the 'N'. - $type = self::NULL2_SYMBOL; + $this->bytes_already_read += 1; // Consume the 'N'. 
+ $type = self::NULL2_SYMBOL; } else { $type = self::INVALID_INPUT; } } elseif ( '#' === $la ) { $type = $this->read_line_comment(); } elseif ( $this->is_whitespace( $la ) ) { - $this->position += strspn( $this->input, self::WHITESPACE_MASK, $this->position ); - $type = self::WHITESPACE; + $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); + $type = self::WHITESPACE; } elseif ( '0' === $la && ( 'x' === $la2 || 'b' === $la2 ) ) { $type = $this->read_number(); } elseif ( ( 'x' === $la || 'X' === $la || 'b' === $la || 'B' === $la ) && "'" === $la2 ) { $type = $this->read_number(); } elseif ( ( 'n' === $la || 'N' === $la ) && "'" === $la2 ) { - $this->position += 1; // n/N - $type = $this->read_quoted_text( "'" ); + $this->bytes_already_read += 1; // n/N + $type = $this->read_quoted_text( "'" ); if ( self::SINGLE_QUOTED_TEXT === $type ) { $type = self::NCHAR_TEXT; } } elseif ( null === $la ) { $type = self::EOF; } else { - $previous_position = $this->position - 1; + $previous_position = $this->bytes_already_read - 1; $bytes_parsed = $this->parse_identifier(); if ( $bytes_parsed > 0 ) { - $this->position += $bytes_parsed; + $this->bytes_already_read += $bytes_parsed; // When preceded by a dot, it is always an identifier. - if ( $previous_position >= 0 && '.' === $this->input[ $previous_position ] ) { + if ( $previous_position >= 0 && '.' 
=== $this->sql[ $previous_position ] ) { $type = self::IDENTIFIER; - } elseif ( '_' === $la && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->get_text() ) ] ) ) { + } elseif ( '_' === $la && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->get_current_token_bytes() ) ] ) ) { $type = self::UNDERSCORE_CHARSET; } else { - $type = $this->determine_identifier_or_keyword_type( $this->get_text() ); + $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); } } else { - $this->position += 1; - $type = self::INVALID_INPUT; + $this->bytes_already_read += 1; + $type = self::INVALID_INPUT; } } return $type; } + private function get_current_token_bytes(): string { + return substr( + $this->sql, + $this->token_starts_at, + $this->bytes_already_read - $this->token_starts_at + ); + } + /** * Unquoted identifiers: * https://dev.mysql.com/doc/refman/8.4/en/identifiers.html @@ -2366,22 +2492,22 @@ private function parse_identifier(): int { while ( true ) { // First, let's try to parse an ASCII sequence. $byte_length += strspn( - $this->input, + $this->sql, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$', - $this->position + $byte_length + $this->bytes_already_read + $byte_length ); // Check if the following byte can be part of a multibyte character. // If not, bail out early to avoid unnecessary UTF-8 decoding. - $byte = $this->input[ $this->position + $byte_length ] ?? null; + $byte = $this->sql[ $this->bytes_already_read + $byte_length ] ?? null; if ( null === $byte || ord( $byte ) < 128 ) { break; } // Check the \x{0080}-\x{ffff} Unicode character range. $codepoint = utf8_codepoint_at( - $this->input, - $this->position + $byte_length, + $this->sql, + $this->bytes_already_read + $byte_length, $bytes_parsed ); @@ -2397,14 +2523,14 @@ private function parse_identifier(): int { // An identifier cannot consist solely of digits. 
if ( - strspn( $this->input, self::DIGIT_MASK, $this->position, $byte_length ) === $byte_length + strspn( $this->sql, self::DIGIT_MASK, $this->bytes_already_read, $byte_length ) === $byte_length ) { return 0; } return $byte_length; } - protected function determine_identifier_or_keyword_type( string $value ): int { + private function determine_identifier_or_keyword_type( string $value ): int { $value = strtoupper( $value ); // Lookup the string in the token table. @@ -2413,10 +2539,10 @@ protected function determine_identifier_or_keyword_type( string $value ): int { return self::IDENTIFIER; } - // Apply MySQL server version specifics (positive number: >= , negative number: < ). + // Apply MySQL version specifics (positive number: >= , negative number: < ). if ( isset( self::VERSIONS[ $type ] ) ) { $version = self::VERSIONS[ $type ]; - if ( $this->server_version < $version || -$version >= $this->server_version ) { + if ( $this->mysql_version < $version || -$version >= $this->mysql_version ) { return self::IDENTIFIER; } } @@ -2424,21 +2550,21 @@ protected function determine_identifier_or_keyword_type( string $value ): int { // Apply MySQL version ranges manually. if ( self::MAX_STATEMENT_TIME_SYMBOL === $type - && ! ( $this->server_version >= 50704 && $this->server_version < 50708 ) + && ! ( $this->mysql_version >= 50704 && $this->mysql_version < 50708 ) ) { return self::IDENTIFIER; } if ( self::NONBLOCKING_SYMBOL === $type - && ! ( $this->server_version >= 50700 && $this->server_version < 50706 ) + && ! 
( $this->mysql_version >= 50700 && $this->mysql_version < 50706 ) ) { return self::IDENTIFIER; } if ( self::REMOTE_SYMBOL === $type - && ( $this->server_version >= 80003 && $this->server_version < 80014 ) + && ( $this->mysql_version >= 80003 && $this->mysql_version < 80014 ) ) { return self::IDENTIFIER; } @@ -2447,9 +2573,9 @@ protected function determine_identifier_or_keyword_type( string $value ): int { if ( isset( self::FUNCTIONS[ $type ] ) ) { // Skip any whitespace character if the SQL mode says they should be ignored. if ( $this->is_sql_mode_active( self::SQL_MODE_IGNORE_SPACE ) ) { - $this->position += strspn( $this->input, self::WHITESPACE_MASK, $this->position ); + $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); } - if ( '(' !== ( $this->input[ $this->position ] ?? null ) ) { + if ( '(' !== ( $this->sql[ $this->bytes_already_read ] ?? null ) ) { return self::IDENTIFIER; } } @@ -2463,10 +2589,10 @@ protected function determine_identifier_or_keyword_type( string $value ): int { return self::SYNONYMS[ $type ] ?? $type; } - protected function read_number(): int { - $start_position = $this->position; - $current_byte = $this->input[ $this->position ] ?? null; - $next_byte = $this->input[ $this->position + 1 ] ?? null; + private function read_number(): int { + $start_position = $this->bytes_already_read; + $current_byte = $this->sql[ $this->bytes_already_read ] ?? null; + $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; if ( // HEX number in the form of 0xN. @@ -2474,11 +2600,11 @@ protected function read_number(): int { // HEX number in the form of x'N' or X'N'. || ( ( 'x' === $current_byte || 'X' === $current_byte ) && "'" === $next_byte ) ) { - $is_quoted = "'" === $next_byte; - $this->position += 2; // Consume "0x" or "x'". 
- $this->position += strspn( $this->input, '0123456789abcdefABCDEF', $this->position ); + $is_quoted = "'" === $next_byte; + $this->bytes_already_read += 2; // Consume "0x" or "x'". + $this->bytes_already_read += strspn( $this->sql, '0123456789abcdefABCDEF', $this->bytes_already_read ); if ( $is_quoted ) { - $this->position += 1; // Consume the "'". + $this->bytes_already_read += 1; // Consume the "'". } $type = self::HEX_NUMBER; } elseif ( @@ -2487,60 +2613,60 @@ protected function read_number(): int { // BIN number in the form of b'N' or B'N'. || ( ( 'b' === $current_byte || 'B' === $current_byte ) && "'" === $next_byte ) ) { - $is_quoted = "'" === $next_byte; - $this->position += 2; // Consume "0b" or "b'". - $this->position += strspn( $this->input, '01', $this->position ); + $is_quoted = "'" === $next_byte; + $this->bytes_already_read += 2; // Consume "0b" or "b'". + $this->bytes_already_read += strspn( $this->sql, '01', $this->bytes_already_read ); if ( $is_quoted ) { - $this->position += 1; // Consume the "'". + $this->bytes_already_read += 1; // Consume the "'". } $type = self::BIN_NUMBER; } else { // Here, we have a sequence starting with N or .N, where N is a digit. // 1. Try integer first. - $this->position += strspn( $this->input, self::DIGIT_MASK, $this->position ); - $type = self::INT_NUMBER; + $this->bytes_already_read += strspn( $this->sql, self::DIGIT_MASK, $this->bytes_already_read ); + $type = self::INT_NUMBER; // 2. In case of N. or .N, it's a decimal or float number. - if ( '.' === ( $this->input[ $this->position ] ?? null ) ) { - $this->position += 1; - $type = self::DECIMAL_NUMBER; - $this->position += strspn( $this->input, self::DIGIT_MASK, $this->position ); + if ( '.' === ( $this->sql[ $this->bytes_already_read ] ?? null ) ) { + $this->bytes_already_read += 1; + $type = self::DECIMAL_NUMBER; + $this->bytes_already_read += strspn( $this->sql, self::DIGIT_MASK, $this->bytes_already_read ); } // 3. 
When exponent is present, it's a float number. - $current_byte = $this->input[ $this->position ] ?? null; - $next_byte = $this->input[ $this->position + 1 ] ?? null; + $current_byte = $this->sql[ $this->bytes_already_read ] ?? null; + $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; $has_exponent = ( 'e' === $current_byte || 'E' === $current_byte ) && ( $this->is_digit( $next_byte ) - || ( ( '+' === $next_byte || '-' === $next_byte ) && $this->is_digit( $this->input[ $this->position + 2 ] ?? null ) ) + || ( ( '+' === $next_byte || '-' === $next_byte ) && $this->is_digit( $this->sql[ $this->bytes_already_read + 2 ] ?? null ) ) ); if ( $has_exponent ) { - $this->position += 1; // Consume the 'e' or 'E'. - $this->position += 1; // Consume the '+', '-', or digit. - $this->position += strspn( $this->input, self::DIGIT_MASK, $this->position ); - $type = self::FLOAT_NUMBER; + $this->bytes_already_read += 1; // Consume the 'e' or 'E'. + $this->bytes_already_read += 1; // Consume the '+', '-', or digit. + $this->bytes_already_read += strspn( $this->sql, self::DIGIT_MASK, $this->bytes_already_read ); + $type = self::FLOAT_NUMBER; } } // In MySQL, when an input matches both a number and an identifier, the number always wins. // However, when the number is followed by a non-numeric identifier-like character, it is // considered an identifier... unless it's a float number, which ignores subsequent input. 
- $text = $this->get_text(); + $text = $this->get_current_token_bytes(); $possible_identifier_prefix = self::INT_NUMBER === $type || ( '0' === $text[0] && ( 'b' === $text[1] || 'x' === $text[1] ) ); if ( $possible_identifier_prefix ) { - $position = $this->position; - $this->position = $start_position; - $bytes_parsed = $this->parse_identifier(); - $this->position = $position; + $position = $this->bytes_already_read; + $this->bytes_already_read = $start_position; + $bytes_parsed = $this->parse_identifier(); + $this->bytes_already_read = $position; // When matched more than the number, it's an identifier. - if ( $start_position + $bytes_parsed > $this->position ) { - $this->position = $start_position + $bytes_parsed; - $type = self::IDENTIFIER; + if ( $start_position + $bytes_parsed > $this->bytes_already_read ) { + $this->bytes_already_read = $start_position + $bytes_parsed; + $type = self::IDENTIFIER; } } return $type; @@ -2557,8 +2683,8 @@ protected function read_number(): int { * * @param string $quote The quote character - ', ", or `. */ - protected function read_quoted_text( string $quote ): int { - $this->position += 1; // Consume the quote. + private function read_quoted_text( string $quote ): int { + $this->bytes_already_read += 1; // Consume the quote. $no_backslash_escapes = $this->is_sql_mode_active( self::SQL_MODE_NO_BACKSLASH_ESCAPES @@ -2566,9 +2692,9 @@ protected function read_quoted_text( string $quote ): int { // We need to look for the closing quote in a loop, as it can be escaped, // in which case the escape sequence is consumed and the loop continues. - $pos = $this->position; + $pos = $this->bytes_already_read; while ( true ) { - $pos += strcspn( $this->input, $quote, $pos ); + $pos += strcspn( $this->sql, $quote, $pos ); // Quotes can be escaped with a "\", except when NO_BACKSLASH_ESCAPES // is set, in which case it is treated as a regular character. 
@@ -2576,7 +2702,7 @@ protected function read_quoted_text( string $quote ): int { // The quote is escaped only when the number of preceding backslashes // is odd, as since a "\\" is simply an escaped backslash character. if ( ! $no_backslash_escapes ) { - for ($i = 0; '\\' === $this->input[ $pos - $i - 1 ]; $i += 1); + for ($i = 0; '\\' === $this->sql[ $pos - $i - 1 ]; $i += 1); if ( 1 === $i % 2 ) { $pos += 1; continue; @@ -2584,12 +2710,12 @@ protected function read_quoted_text( string $quote ): int { } // Unclosed string - unexpected EOF. - if ( ( $this->input[ $pos ] ?? null ) !== $quote ) { + if ( ( $this->sql[ $pos ] ?? null ) !== $quote ) { return self::INVALID_INPUT; } // Check if the quote is doubled. - if ( ( $this->input[ $pos + 1 ] ?? null ) === $quote ) { + if ( ( $this->sql[ $pos + 1 ] ?? null ) === $quote ) { $pos += 2; continue; } @@ -2598,7 +2724,7 @@ protected function read_quoted_text( string $quote ): int { } $pos += 1; - $this->position = $pos; + $this->bytes_already_read = $pos; if ( '`' === $quote ) { return self::BACK_TICK_QUOTED_ID; @@ -2609,48 +2735,48 @@ protected function read_quoted_text( string $quote ): int { } } - protected function read_line_comment(): int { - $this->position += strcspn( $this->input, "\r\n", $this->position ); + private function read_line_comment(): int { + $this->bytes_already_read += strcspn( $this->sql, "\r\n", $this->bytes_already_read ); return self::COMMENT; } - protected function read_mysql_comment(): int { + private function read_mysql_comment(): int { // MySQL-specific comment in one of the following forms: - // 1. /*! ... */ - The content is treated as regular SQL code. - // 2. /*!12345 ... */ - The content is treated as SQL code when "server version >= 12345". - $this->position += 3; // Consume the '/*!'. + // 1. /*! ... */ - The content is treated as SQL. + // 2. /*!12345 ... */ - The content is treated as SQL when "MySQL version >= 12345". + $this->bytes_already_read += 3; // Consume the '/*!'. 
// Check if the next 5 characters are digits. - $digit_count = strspn( $this->input, self::DIGIT_MASK, $this->position, 5 ); + $digit_count = strspn( $this->sql, self::DIGIT_MASK, $this->bytes_already_read, 5 ); $is_version_comment = 5 === $digit_count; // For version comments, extract the version number. $version = $is_version_comment - ? (int) substr( $this->input, $this->position, $digit_count ) + ? (int) substr( $this->sql, $this->bytes_already_read, $digit_count ) : 0; - if ( $this->server_version < $version ) { + if ( $this->mysql_version < $version ) { // When version not satisfied. Treat the content as a regular comment. $this->read_comment_content(); return self::COMMENT; } else { // Version satisfied or not specified. Treat the content as SQL code. - $this->position += $digit_count; // Skip the version number. - $this->in_version_comment = true; + $this->bytes_already_read += $digit_count; // Skip the version number. + $this->in_mysql_comment = true; return self::MYSQL_COMMENT_START; } } - protected function read_comment_content() { + private function read_comment_content() { while ( true ) { - $this->position += strcspn( $this->input, '*', $this->position ); - $this->position += 1; // Consume the '*'. - $byte = $this->input[ $this->position ] ?? null; + $this->bytes_already_read += strcspn( $this->sql, '*', $this->bytes_already_read ); + $this->bytes_already_read += 1; // Consume the '*'. + $byte = $this->sql[ $this->bytes_already_read ] ?? null; if ( null === $byte ) { break; } if ( '/' === $byte ) { - $this->position += 1; // Consume the '/'. + $this->bytes_already_read += 1; // Consume the '/'. break; } }