Skip to content

Commit

Permalink
Merge pull request #123 from wp-cli/use_preg_split_in_safe_substr
Browse files Browse the repository at this point in the history
Use preg_split instead of preg_match with quantifiers in safe_substr.
  • Loading branch information
danielbachhuber authored Oct 12, 2017
2 parents 14b8a08 + dcf1f29 commit 363c753
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 36 deletions.
46 changes: 22 additions & 24 deletions lib/cli/cli.php
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ function safe_strlen( $str, $encoding = false ) {
if ( ! $encoding ) {
$encoding = mb_detect_encoding( $str, null, true /*strict*/ );
}
$length = mb_strlen( $str, $encoding );
$length = $encoding ? mb_strlen( $str, $encoding ) : mb_strlen( $str ); // mbstring funcs can fail if given `$encoding` arg that evals to false.
if ( 'UTF-8' === $encoding ) {
// Subtract combining characters.
$length -= preg_match_all( get_unicode_regexs( 'm' ), $str, $dummy /*needed for PHP 5.3*/ );
Expand Down Expand Up @@ -209,14 +209,22 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin
if ( $length < 0 || ( $is_width && ( null === $length || false === $length ) ) ) {
return false;
}
$have_safe_strlen = false;
// PHP 5.3 substr takes false as full length, PHP > 5.3 takes null - for compat. do `safe_strlen()`.
// Need this for normalization below and other uses.
$safe_strlen = safe_strlen( $str, $encoding );

// Normalize `$length` when not specified - PHP 5.3 substr takes false as full length, PHP > 5.3 takes null.
if ( null === $length || false === $length ) {
$length = safe_strlen( $str, $encoding );
$have_safe_strlen = true;
$length = $safe_strlen;
}
// Normalize `$start` - various methods treat this differently.
if ( $start > $safe_strlen ) {
return '';
}
if ( $start < 0 && -$start > $safe_strlen ) {
$start = 0;
}

// Allow for selective testings - "1" bit set tests grapheme_substr(), "2" preg_match( '/\X/' ), "4" mb_substr(), "8" substr().
// Allow for selective testings - "1" bit set tests grapheme_substr(), "2" preg_split( '/\X/' ), "4" mb_substr(), "8" substr().
$test_safe_substr = getenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' );

// Assume UTF-8 if no encoding given - `grapheme_substr()` will return false (not null like `grapheme_strlen()`) if given non-UTF-8 string.
Expand All @@ -225,22 +233,12 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin
return $is_width ? _safe_substr_eaw( $try, $length ) : $try;
}
}
// Assume UTF-8 if no encoding given - `preg_match()` will return false if given non-UTF-8 string.
// Assume UTF-8 if no encoding given - `preg_split()` returns a one element array if given non-UTF-8 string (PHP bug) so need to check `preg_last_error()`.
if ( ( ! $encoding || 'UTF-8' === $encoding ) && can_use_pcre_x() ) {
if ( $start < 0 ) {
$start = max( $start + ( $have_safe_strlen ? $length : safe_strlen( $str, $encoding ) ), 0 );
}
if ( $start ) {
if ( preg_match( '/^\X{' . $start . '}(\X{0,' . $length . '})/u', $str, $matches ) ) {
if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) {
return $is_width ? _safe_substr_eaw( $matches[1], $length ) : $matches[1];
}
}
} else {
if ( preg_match( '/^\X{0,' . $length . '}/u', $str, $matches ) ) {
if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) {
return $is_width ? _safe_substr_eaw( $matches[0], $length ) : $matches[0];
}
if ( false !== ( $try = preg_split( '/(\X)/u', $str, $safe_strlen + 1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ) ) && ! preg_last_error() ) {
$try = implode( '', array_slice( $try, $start, $length ) );
if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) {
return $is_width ? _safe_substr_eaw( $try, $length ) : $try;
}
}
}
Expand All @@ -250,7 +248,7 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin
$encoding = mb_detect_encoding( $str, null, true /*strict*/ );
}
// Bug: not adjusting for combining chars.
$try = mb_substr( $str, $start, $length, $encoding );
$try = $encoding ? mb_substr( $str, $start, $length, $encoding ) : mb_substr( $str, $start, $length ); // mbstring funcs can fail if given `$encoding` arg that evals to false.
if ( 'UTF-8' === $encoding && $is_width ) {
$try = _safe_substr_eaw( $try, $length );
}
Expand All @@ -275,7 +273,7 @@ function _safe_substr_eaw( $str, $length ) {
// Note that if the length ends in the middle of a double-width char, the char is excluded, not included.

// See if it's all EAW.
if ( preg_match_all( $eaw_regex, $str, $dummy /*needed for PHP 5.3*/ ) === $length ) {
if ( function_exists( 'mb_substr' ) && preg_match_all( $eaw_regex, $str, $dummy /*needed for PHP 5.3*/ ) === $length ) {
// Just halve the length so (rounded down to a minimum of 1).
$str = mb_substr( $str, 0, max( (int) ( $length / 2 ), 1 ), 'UTF-8' );
} else {
Expand Down Expand Up @@ -344,7 +342,7 @@ function strwidth( $string, $encoding = false ) {
if ( ! $encoding ) {
$encoding = mb_detect_encoding( $string, null, true /*strict*/ );
}
$width = mb_strwidth( $string, $encoding );
$width = $encoding ? mb_strwidth( $string, $encoding ) : mb_strwidth( $string ); // mbstring funcs can fail if given `$encoding` arg that evals to false.
if ( 'UTF-8' === $encoding ) {
// Subtract combining characters.
$width -= preg_match_all( $m_regex, $string, $dummy /*needed for PHP 5.3*/ );
Expand Down
52 changes: 40 additions & 12 deletions tests/test-cli.php
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,12 @@ function test_various_substr() {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' );

// Latin, kana, Latin, Latin combining, Thai combining, Hangul.
$str = 'lムnöม้p를';
$str = 'lムnöม้p를'; // 18 bytes.

// Large string.
$large_str_str_start = 65536 * 2;
$large_str = str_repeat( 'a', $large_str_str_start ) . $str;
$large_str_len = strlen( $large_str ); // 128K + 18 bytes.

if ( \cli\can_use_icu() ) {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=1' ); // Tests grapheme_substr().
Expand All @@ -124,6 +129,10 @@ function test_various_substr() {
$this->assertSame( 'lムnöม้p', \cli\safe_substr( $str, 0, 6 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 7 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 8 ) );
$this->assertSame( '', \cli\safe_substr( $str, 19 ) ); // Start too large.
$this->assertSame( '', \cli\safe_substr( $str, 19, 7 ) ); // Start too large, with length.
$this->assertSame( '', \cli\safe_substr( $str, 8 ) ); // Start same as length.
$this->assertSame( '', \cli\safe_substr( $str, 8, 0 ) ); // Start same as length, with zero length.
$this->assertSame( '', \cli\safe_substr( $str, -1 ) );
$this->assertSame( 'p를', \cli\safe_substr( $str, -2 ) );
$this->assertSame( 'ม้p를', \cli\safe_substr( $str, -3 ) );
Expand All @@ -134,11 +143,18 @@ function test_various_substr() {
$this->assertSame( 'ムnöม้p를', \cli\safe_substr( $str, -6 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -7 ) );
$this->assertSame( 'lムnö', \cli\safe_substr( $str, -7, 4 ) );
// $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) ); // grapheme_substr() returns false on this.
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -9 ) ); // Negative start too large.

$this->assertSame( $large_str, \cli\safe_substr( $large_str, 0 ) );
$this->assertSame( '', \cli\safe_substr( $large_str, $large_str_str_start, 0 ) );
$this->assertSame( 'l', \cli\safe_substr( $large_str, $large_str_str_start, 1 ) );
$this->assertSame( 'lム', \cli\safe_substr( $large_str, $large_str_str_start, 2 ) );
$this->assertSame( 'p를', \cli\safe_substr( $large_str, -2 ) );
}

if ( \cli\can_use_pcre_x() ) {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=2' ); // Tests preg_match( '/\X/u' ).
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=2' ); // Tests preg_split( '/\X/u' ).
$this->assertSame( '', \cli\safe_substr( $str, 0, 0 ) );
$this->assertSame( 'l', \cli\safe_substr( $str, 0, 1 ) );
$this->assertSame( 'lム', \cli\safe_substr( $str, 0, 2 ) );
Expand All @@ -148,6 +164,10 @@ function test_various_substr() {
$this->assertSame( 'lムnöม้p', \cli\safe_substr( $str, 0, 6 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 7 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 8 ) );
$this->assertSame( '', \cli\safe_substr( $str, 19 ) ); // Start too large.
$this->assertSame( '', \cli\safe_substr( $str, 19, 7 ) ); // Start too large, with length.
$this->assertSame( '', \cli\safe_substr( $str, 8 ) ); // Start same as length.
$this->assertSame( '', \cli\safe_substr( $str, 8, 0 ) ); // Start same as length, with zero length.
$this->assertSame( '', \cli\safe_substr( $str, -1 ) );
$this->assertSame( 'p를', \cli\safe_substr( $str, -2 ) );
$this->assertSame( 'ม้p를', \cli\safe_substr( $str, -3 ) );
Expand All @@ -159,6 +179,13 @@ function test_various_substr() {
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -7 ) );
$this->assertSame( 'lムnö', \cli\safe_substr( $str, -7, 4 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -9 ) ); // Negative start too large.

$this->assertSame( $large_str, \cli\safe_substr( $large_str, 0 ) );
$this->assertSame( '', \cli\safe_substr( $large_str, $large_str_str_start, 0 ) );
$this->assertSame( 'l', \cli\safe_substr( $large_str, $large_str_str_start, 1 ) );
$this->assertSame( 'lム', \cli\safe_substr( $large_str, $large_str_str_start, 2 ) );
$this->assertSame( 'p를', \cli\safe_substr( $large_str, -2 ) );
}

if ( function_exists( 'mb_substr' ) ) {
Expand All @@ -174,8 +201,9 @@ function test_various_substr() {
$this->assertSame( '', \cli\safe_substr( $str, 0, 0 ) );
$this->assertSame( 'l', \cli\safe_substr( $str, 0, 1 ) );
$this->assertSame( "l\xe3", \cli\safe_substr( $str, 0, 2 ) ); // Corrupt.
$this->assertSame( '', \cli\safe_substr( $str, strlen( $str ) + 1 ) ); // Return '' not false to match behavior of other methods when `$start` > strlen.

// Non-UTF-8 - both grapheme_substr() and preg_match( '/\X/u' ) will fail.
// Non-UTF-8 - both grapheme_substr() and preg_split( '/\X/u' ) will fail.

putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' );

Expand Down Expand Up @@ -392,10 +420,10 @@ function test_strwidth() {

if ( \cli\can_use_icu() ) {
$this->assertSame( 5, \cli\strwidth( $str ) ); // Tests grapheme_strlen().
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_match_all( '/\X/u' ).
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_split( '/\X/u' ).
$this->assertSame( 5, \cli\strwidth( $str ) );
} else {
$this->assertSame( 5, \cli\strwidth( $str ) ); // Tests preg_match_all( '/\X/u' ).
$this->assertSame( 5, \cli\strwidth( $str ) ); // Tests preg_split( '/\X/u' ).
}

if ( function_exists( 'mb_strwidth' ) && function_exists( 'mb_detect_order' ) ) {
Expand All @@ -422,10 +450,10 @@ function test_strwidth() {

if ( \cli\can_use_icu() ) {
$this->assertSame( 11, \cli\strwidth( $str ) ); // Tests grapheme_strlen().
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_match_all( '/\X/u' ).
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_split( '/\X/u' ).
$this->assertSame( 11, \cli\strwidth( $str ) );
} else {
$this->assertSame( 11, \cli\strwidth( $str ) ); // Tests preg_match_all( '/\X/u' ).
$this->assertSame( 11, \cli\strwidth( $str ) ); // Tests preg_split( '/\X/u' ).
}

if ( function_exists( 'mb_strwidth' ) && function_exists( 'mb_detect_order' ) ) {
Expand All @@ -434,7 +462,7 @@ function test_strwidth() {
$this->assertSame( 11, \cli\strwidth( $str ) );
}

// Non-UTF-8 - both grapheme_strlen() and preg_match_all( '/\X/u' ) will fail.
// Non-UTF-8 - both grapheme_strlen() and preg_split( '/\X/u' ) will fail.

putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH' );

Expand Down Expand Up @@ -486,11 +514,11 @@ function test_safe_strlen() {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN' ); // Test grapheme_strlen().
$this->assertSame( 7, \cli\safe_strlen( $str ) );
if ( \cli\can_use_pcre_x() ) {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=2' ); // Test preg_match_all( '/\X/u' ).
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=2' ); // Test preg_split( '/\X/u' ).
$this->assertSame( 7, \cli\safe_strlen( $str ) );
}
} elseif ( \cli\can_use_pcre_x() ) {
$this->assertSame( 7, \cli\safe_strlen( $str ) ); // Tests preg_match_all( '/\X/u' ).
$this->assertSame( 7, \cli\safe_strlen( $str ) ); // Tests preg_split( '/\X/u' ).
} else {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=8' ); // Test strlen().
$this->assertSame( 18, \cli\safe_strlen( $str ) ); // strlen() - no. of bytes.
Expand All @@ -504,7 +532,7 @@ function test_safe_strlen() {
$this->assertSame( 9, mb_strlen( $str, 'UTF-8' ) ); // mb_strlen() - counts the 2 combining chars.
}

// Non-UTF-8 - both grapheme_strlen() and preg_match_all( '/\X/u' ) will fail.
// Non-UTF-8 - both grapheme_strlen() and preg_split( '/\X/u' ) will fail.

putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN' );

Expand Down

0 comments on commit 363c753

Please sign in to comment.