From dcf1f2941307bbd768f6f586c44aecab68e2b2a0 Mon Sep 17 00:00:00 2001 From: gitlost Date: Thu, 12 Oct 2017 20:42:26 +0100 Subject: [PATCH] Use preg_split instead of preg_match with quantifiers in safe_substr. --- lib/cli/cli.php | 46 ++++++++++++++++++++-------------------- tests/test-cli.php | 52 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 36 deletions(-) diff --git a/lib/cli/cli.php b/lib/cli/cli.php index 8edfd79..6aeb867 100755 --- a/lib/cli/cli.php +++ b/lib/cli/cli.php @@ -181,7 +181,7 @@ function safe_strlen( $str, $encoding = false ) { if ( ! $encoding ) { $encoding = mb_detect_encoding( $str, null, true /*strict*/ ); } - $length = mb_strlen( $str, $encoding ); + $length = $encoding ? mb_strlen( $str, $encoding ) : mb_strlen( $str ); // mbstring funcs can fail if given `$encoding` arg that evals to false. if ( 'UTF-8' === $encoding ) { // Subtract combining characters. $length -= preg_match_all( get_unicode_regexs( 'm' ), $str, $dummy /*needed for PHP 5.3*/ ); @@ -209,14 +209,22 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin if ( $length < 0 || ( $is_width && ( null === $length || false === $length ) ) ) { return false; } - $have_safe_strlen = false; - // PHP 5.3 substr takes false as full length, PHP > 5.3 takes null - for compat. do `safe_strlen()`. + // Need this for normalization below and other uses. + $safe_strlen = safe_strlen( $str, $encoding ); + + // Normalize `$length` when not specified - PHP 5.3 substr takes false as full length, PHP > 5.3 takes null. if ( null === $length || false === $length ) { - $length = safe_strlen( $str, $encoding ); - $have_safe_strlen = true; + $length = $safe_strlen; + } + // Normalize `$start` - various methods treat this differently. + if ( $start > $safe_strlen ) { + return ''; + } + if ( $start < 0 && -$start > $safe_strlen ) { + $start = 0; } - // Allow for selective testings - "1" bit set tests grapheme_substr(), "2" preg_match( '/\X/' ), "4" mb_substr(), "8" substr(). + // Allow for selective testings - "1" bit set tests grapheme_substr(), "2" preg_split( '/\X/' ), "4" mb_substr(), "8" substr(). $test_safe_substr = getenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' ); // Assume UTF-8 if no encoding given - `grapheme_substr()` will return false (not null like `grapheme_strlen()`) if given non-UTF-8 string. @@ -225,22 +233,12 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin return $is_width ? _safe_substr_eaw( $try, $length ) : $try; } } - // Assume UTF-8 if no encoding given - `preg_match()` will return false if given non-UTF-8 string. + // Assume UTF-8 if no encoding given - `preg_split()` returns a one element array if given non-UTF-8 string (PHP bug) so need to check `preg_last_error()`. if ( ( ! $encoding || 'UTF-8' === $encoding ) && can_use_pcre_x() ) { - if ( $start < 0 ) { - $start = max( $start + ( $have_safe_strlen ? $length : safe_strlen( $str, $encoding ) ), 0 ); - } - if ( $start ) { - if ( preg_match( '/^\X{' . $start . '}(\X{0,' . $length . '})/u', $str, $matches ) ) { - if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) { - return $is_width ? _safe_substr_eaw( $matches[1], $length ) : $matches[1]; - } - } - } else { - if ( preg_match( '/^\X{0,' . $length . '}/u', $str, $matches ) ) { - if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) { - return $is_width ? _safe_substr_eaw( $matches[0], $length ) : $matches[0]; - } + if ( false !== ( $try = preg_split( '/(\X)/u', $str, $safe_strlen + 1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ) ) && ! preg_last_error() ) { + $try = implode( '', array_slice( $try, $start, $length ) ); + if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) { + return $is_width ? _safe_substr_eaw( $try, $length ) : $try; } } } @@ -250,7 +248,7 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin $encoding = mb_detect_encoding( $str, null, true /*strict*/ ); } // Bug: not adjusting for combining chars. - $try = mb_substr( $str, $start, $length, $encoding ); + $try = $encoding ? mb_substr( $str, $start, $length, $encoding ) : mb_substr( $str, $start, $length ); // mbstring funcs can fail if given `$encoding` arg that evals to false. if ( 'UTF-8' === $encoding && $is_width ) { $try = _safe_substr_eaw( $try, $length ); } @@ -275,7 +273,7 @@ function _safe_substr_eaw( $str, $length ) { // Note that if the length ends in the middle of a double-width char, the char is excluded, not included. // See if it's all EAW. - if ( preg_match_all( $eaw_regex, $str, $dummy /*needed for PHP 5.3*/ ) === $length ) { + if ( function_exists( 'mb_substr' ) && preg_match_all( $eaw_regex, $str, $dummy /*needed for PHP 5.3*/ ) === $length ) { // Just halve the length so (rounded down to a minimum of 1). $str = mb_substr( $str, 0, max( (int) ( $length / 2 ), 1 ), 'UTF-8' ); } else { @@ -344,7 +342,7 @@ function strwidth( $string, $encoding = false ) { if ( ! $encoding ) { $encoding = mb_detect_encoding( $string, null, true /*strict*/ ); } - $width = mb_strwidth( $string, $encoding ); + $width = $encoding ? mb_strwidth( $string, $encoding ) : mb_strwidth( $string ); // mbstring funcs can fail if given `$encoding` arg that evals to false. if ( 'UTF-8' === $encoding ) { // Subtract combining characters. $width -= preg_match_all( $m_regex, $string, $dummy /*needed for PHP 5.3*/ ); diff --git a/tests/test-cli.php b/tests/test-cli.php index 5255e10..5df0b6a 100644 --- a/tests/test-cli.php +++ b/tests/test-cli.php @@ -111,7 +111,12 @@ function test_various_substr() { putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' ); // Latin, kana, Latin, Latin combining, Thai combining, Hangul. - $str = 'lムnöม้p를'; + $str = 'lムnöม้p를'; // 18 bytes. + + // Large string. + $large_str_str_start = 65536 * 2; + $large_str = str_repeat( 'a', $large_str_str_start ) . $str; + $large_str_len = strlen( $large_str ); // 128K + 18 bytes. if ( \cli\can_use_icu() ) { putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=1' ); // Tests grapheme_substr(). @@ -124,6 +129,10 @@ function test_various_substr() { $this->assertSame( 'lムnöม้p', \cli\safe_substr( $str, 0, 6 ) ); $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 7 ) ); $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 8 ) ); + $this->assertSame( '', \cli\safe_substr( $str, 19 ) ); // Start too large. + $this->assertSame( '', \cli\safe_substr( $str, 19, 7 ) ); // Start too large, with length. + $this->assertSame( '', \cli\safe_substr( $str, 8 ) ); // Start same as length. + $this->assertSame( '', \cli\safe_substr( $str, 8, 0 ) ); // Start same as length, with zero length. $this->assertSame( '를', \cli\safe_substr( $str, -1 ) ); $this->assertSame( 'p를', \cli\safe_substr( $str, -2 ) ); $this->assertSame( 'ม้p를', \cli\safe_substr( $str, -3 ) ); @@ -134,11 +143,18 @@ function test_various_substr() { $this->assertSame( 'ムnöม้p를', \cli\safe_substr( $str, -6 ) ); $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -7 ) ); $this->assertSame( 'lムnö', \cli\safe_substr( $str, -7, 4 ) ); - // $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) ); // grapheme_substr() returns false on this. + $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) ); + $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -9 ) ); // Negative start too large. + + $this->assertSame( $large_str, \cli\safe_substr( $large_str, 0 ) ); + $this->assertSame( '', \cli\safe_substr( $large_str, $large_str_str_start, 0 ) ); + $this->assertSame( 'l', \cli\safe_substr( $large_str, $large_str_str_start, 1 ) ); + $this->assertSame( 'lム', \cli\safe_substr( $large_str, $large_str_str_start, 2 ) ); + $this->assertSame( 'p를', \cli\safe_substr( $large_str, -2 ) ); } if ( \cli\can_use_pcre_x() ) { - putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=2' ); // Tests preg_match( '/\X/u' ). + putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=2' ); // Tests preg_split( '/\X/u' ). $this->assertSame( '', \cli\safe_substr( $str, 0, 0 ) ); $this->assertSame( 'l', \cli\safe_substr( $str, 0, 1 ) ); $this->assertSame( 'lム', \cli\safe_substr( $str, 0, 2 ) ); @@ -148,6 +164,10 @@ function test_various_substr() { $this->assertSame( 'lムnöม้p', \cli\safe_substr( $str, 0, 6 ) ); $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 7 ) ); $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 8 ) ); + $this->assertSame( '', \cli\safe_substr( $str, 19 ) ); // Start too large. + $this->assertSame( '', \cli\safe_substr( $str, 19, 7 ) ); // Start too large, with length. + $this->assertSame( '', \cli\safe_substr( $str, 8 ) ); // Start same as length. + $this->assertSame( '', \cli\safe_substr( $str, 8, 0 ) ); // Start same as length, with zero length. $this->assertSame( '를', \cli\safe_substr( $str, -1 ) ); $this->assertSame( 'p를', \cli\safe_substr( $str, -2 ) ); $this->assertSame( 'ม้p를', \cli\safe_substr( $str, -3 ) ); @@ -159,6 +179,13 @@ function test_various_substr() { $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -7 ) ); $this->assertSame( 'lムnö', \cli\safe_substr( $str, -7, 4 ) ); $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) ); + $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -9 ) ); // Negative start too large. + + $this->assertSame( $large_str, \cli\safe_substr( $large_str, 0 ) ); + $this->assertSame( '', \cli\safe_substr( $large_str, $large_str_str_start, 0 ) ); + $this->assertSame( 'l', \cli\safe_substr( $large_str, $large_str_str_start, 1 ) ); + $this->assertSame( 'lム', \cli\safe_substr( $large_str, $large_str_str_start, 2 ) ); + $this->assertSame( 'p를', \cli\safe_substr( $large_str, -2 ) ); } if ( function_exists( 'mb_substr' ) ) { @@ -174,8 +201,9 @@ function test_various_substr() { $this->assertSame( '', \cli\safe_substr( $str, 0, 0 ) ); $this->assertSame( 'l', \cli\safe_substr( $str, 0, 1 ) ); $this->assertSame( "l\xe3", \cli\safe_substr( $str, 0, 2 ) ); // Corrupt. + $this->assertSame( '', \cli\safe_substr( $str, strlen( $str ) + 1 ) ); // Return '' not false to match behavior of other methods when `$start` > strlen. - // Non-UTF-8 - both grapheme_substr() and preg_match( '/\X/u' ) will fail. + // Non-UTF-8 - both grapheme_substr() and preg_split( '/\X/u' ) will fail. putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' ); @@ -392,10 +420,10 @@ function test_strwidth() { if ( \cli\can_use_icu() ) { $this->assertSame( 5, \cli\strwidth( $str ) ); // Tests grapheme_strlen(). - putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_match_all( '/\X/u' ). + putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_split( '/\X/u' ). $this->assertSame( 5, \cli\strwidth( $str ) ); } else { - $this->assertSame( 5, \cli\strwidth( $str ) ); // Tests preg_match_all( '/\X/u' ). + $this->assertSame( 5, \cli\strwidth( $str ) ); // Tests preg_split( '/\X/u' ). } if ( function_exists( 'mb_strwidth' ) && function_exists( 'mb_detect_order' ) ) { @@ -422,10 +450,10 @@ function test_strwidth() { if ( \cli\can_use_icu() ) { $this->assertSame( 11, \cli\strwidth( $str ) ); // Tests grapheme_strlen(). - putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_match_all( '/\X/u' ). + putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_split( '/\X/u' ). $this->assertSame( 11, \cli\strwidth( $str ) ); } else { - $this->assertSame( 11, \cli\strwidth( $str ) ); // Tests preg_match_all( '/\X/u' ). + $this->assertSame( 11, \cli\strwidth( $str ) ); // Tests preg_split( '/\X/u' ). } if ( function_exists( 'mb_strwidth' ) && function_exists( 'mb_detect_order' ) ) { @@ -434,7 +462,7 @@ function test_strwidth() { $this->assertSame( 11, \cli\strwidth( $str ) ); } - // Non-UTF-8 - both grapheme_strlen() and preg_match_all( '/\X/u' ) will fail. + // Non-UTF-8 - both grapheme_strlen() and preg_split( '/\X/u' ) will fail. putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH' ); @@ -486,11 +514,11 @@ function test_safe_strlen() { putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN' ); // Test grapheme_strlen(). $this->assertSame( 7, \cli\safe_strlen( $str ) ); if ( \cli\can_use_pcre_x() ) { - putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=2' ); // Test preg_match_all( '/\X/u' ). + putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=2' ); // Test preg_split( '/\X/u' ). $this->assertSame( 7, \cli\safe_strlen( $str ) ); } } elseif ( \cli\can_use_pcre_x() ) { - $this->assertSame( 7, \cli\safe_strlen( $str ) ); // Tests preg_match_all( '/\X/u' ). + $this->assertSame( 7, \cli\safe_strlen( $str ) ); // Tests preg_split( '/\X/u' ). } else { putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=8' ); // Test strlen(). $this->assertSame( 18, \cli\safe_strlen( $str ) ); // strlen() - no. of bytes. @@ -504,7 +532,7 @@ function test_safe_strlen() { $this->assertSame( 9, mb_strlen( $str, 'UTF-8' ) ); // mb_strlen() - counts the 2 combining chars. } - // Non-UTF-8 - both grapheme_strlen() and preg_match_all( '/\X/u' ) will fail. + // Non-UTF-8 - both grapheme_strlen() and preg_split( '/\X/u' ) will fail. putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN' );