Make WordPress Core

Changeset 60969


Ignore:
Timestamp:
10/18/2025 04:34:02 AM (8 weeks ago)
Author:
dmsnell
Message:

Charset: Rely on new UTF-8 pipeline for mb_substr() fallback.

The existing polyfill for mb_substr() contains a number of issues leaving plenty of opportunity for improvement. Specifically, the following are all deficiencies: it relies on Unicode PCRE support, assumes input strings are valid UTF-8, splits input strings into an array of characters (1,000 at a time, iterating until complete), and re-joins them at the end.

This patch provides an updated polyfill which will reliably parse UTF-8 strings even in the presence of invalid bytes. It computes boundaries for the substring extraction with zero allocations and then returns a single substr() call at the end.

This change improves the reliability of UTF-8 string handling and removes behavioral variability based on the runtime system.

Developed in https://github.com/WordPress/wordpress-develop/pull/9829
Discussed in https://core.trac.wordpress.org/ticket/63863

See #63863.

Location:
trunk
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/compat-utf8.php

    r60950 r60969  
    340340
    341341/**
     342 * Given a starting offset within a string and a maximum number of code points,
     343 * return how many bytes are occupied by the span of characters.
     344 *
     345 * Invalid spans of bytes count as a single code point according to the maximal
     346 * subpart rule. This function is a fallback method for calling
     347 * `strlen( mb_substr( substr( $text, $byte_offset ), 0, $max_code_points ) )`.
     348 *
     349 * @since 6.9.0
     350 * @access private
     351 *
     352 * @param string $text              Count bytes of span in this text.
     353 * @param int    $byte_offset       Start counting at this byte offset.
     354 * @param int    $max_code_points   Stop counting after this many code points have been seen,
     355 *                                  or at the end of the string.
     356 * @param ?int   $found_code_points Optional. Will be set to number of found code points in
     357 *                                  span, as this might be smaller than the maximum count if
     358 *                                  the string is not long enough.
     359 * @return int Number of bytes spanned by the code points.
     360 */
     361function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
     362    $was_at            = $byte_offset;
     363    $invalid_length    = 0;
     364    $end               = strlen( $text );
     365    $found_code_points = 0;
     366
     367    while ( $byte_offset < $end && $found_code_points < $max_code_points ) {
     368        $needed      = $max_code_points - $found_code_points;
     369        $chunk_count = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $needed );
     370
     371        $found_code_points += $chunk_count;
     372
     373        // Invalid spans only convey one code point count regardless of how long they are.
     374        if ( 0 !== $invalid_length && $found_code_points < $max_code_points ) {
     375            ++$found_code_points;
     376            $byte_offset += $invalid_length;
     377        }
     378    }
     379
     380    return $byte_offset - $was_at;
     381}
     382
     383/**
    342384 * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
    343385 * with the deprecated function from the PHP standard library.
  • trunk/src/wp-includes/compat.php

    r60950 r60969  
    3434 * @ignore
    3535 * @since 4.2.2
     36 * @since 6.9.0 Deprecated the `$set` argument.
    3637 * @access private
    3738 *
    38  * @param bool $set - Used for testing only
    39  *             null   : default - get PCRE/u capability
    40  *             false  : Used for testing - return false for future calls to this function
    41  *             'reset': Used for testing - restore default behavior of this function
     39 * @param bool $set Deprecated. This argument is no longer used for testing purposes.
    4240 */
    4341function _wp_can_use_pcre_u( $set = null ) {
    44     static $utf8_pcre = 'reset';
    45 
    46     if ( null !== $set ) {
    47         $utf8_pcre = $set;
    48     }
    49 
    50     if ( 'reset' === $utf8_pcre ) {
    51         $utf8_pcre = true;
    52 
    53         set_error_handler(
    54             function ( $errno, $errstr ) use ( &$utf8_pcre ) {
    55                 if ( str_starts_with( $errstr, 'preg_match():' ) ) {
    56                     $utf8_pcre = false;
    57                     return true;
    58                 }
    59 
    60                 return false;
    61             },
    62             E_WARNING
    63         );
    64 
    65         /*
    66          * Attempt to compile a PCRE pattern with the PCRE_UTF8 flag. For
    67          * systems lacking Unicode support this will trigger a warning
    68          * during compilation, which the error handler will intercept.
    69          */
    70         preg_match( '//u', '' );
    71 
    72         restore_error_handler();
    73     }
     42    static $utf8_pcre = null;
     43
     44    if ( isset( $set ) ) {
     45        _deprecated_argument( __FUNCTION__, '6.9.0' );
     46    }
     47
     48    if ( isset( $utf8_pcre ) ) {
     49        return $utf8_pcre;
     50    }
     51
     52    $utf8_pcre = true;
     53    set_error_handler(
     54        function ( $errno, $errstr ) use ( &$utf8_pcre ) {
     55            if ( str_starts_with( $errstr, 'preg_match():' ) ) {
     56                $utf8_pcre = false;
     57                return true;
     58            }
     59
     60            return false;
     61        },
     62        E_WARNING
     63    );
     64
     65    /*
     66     * Attempt to compile a PCRE pattern with the PCRE_UTF8 flag. For
     67     * systems lacking Unicode support this will trigger a warning
     68     * during compilation, which the error handler will intercept.
     69     */
     70    preg_match( '//u', '' );
     71    restore_error_handler();
    7472
    7573    return $utf8_pcre;
     
    137135 * Internal compat function to mimic mb_substr().
    138136 *
    139  * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit.
    140  * For `$encoding === UTF-8`, the `$str` input is expected to be a valid UTF-8 byte
    141  * sequence. The behavior of this function for invalid inputs is undefined.
     137 * Only supports UTF-8 and non-shifting single-byte encodings. For all other encodings
     138 * expect the substrings to be misaligned. When the given encoding (or the `blog_charset`
     139 * if none is provided) isn’t UTF-8 then the function returns the output of {@see \substr()}.
    142140 *
    143141 * @ignore
     
    145143 *
    146144 * @param string      $str      The string to extract the substring from.
    147  * @param int         $start    Position to being extraction from in `$str`.
     145 * @param int         $start    Character offset at which to start the substring extraction.
    148146 * @param int|null    $length   Optional. Maximum number of characters to extract from `$str`.
    149147 *                              Default null.
     
    156154    }
    157155
    158     if ( null === $encoding ) {
    159         $encoding = get_option( 'blog_charset' );
    160     }
     156    // The solution below works only for UTF-8; treat all other encodings as byte streams.
     157    if ( ! _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) ) ) {
     158        return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
     159    }
     160
     161    $total_length = ( $start < 0 || $length < 0 )
     162        ? _wp_utf8_codepoint_count( $str )
     163        : 0;
     164
     165    $normalized_start = $start < 0
     166        ? max( 0, $total_length + $start )
     167        : $start;
    161168
    162169    /*
    163      * The solution below works only for UTF-8, so in case of a different
    164      * charset just use built-in substr().
    165      */
    166     if ( ! _is_utf8_charset( $encoding ) ) {
    167         return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
    168     }
    169 
    170     if ( _wp_can_use_pcre_u() ) {
    171         // Use the regex unicode support to separate the UTF-8 characters into an array.
    172         preg_match_all( '/./us', $str, $match );
    173         $chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length );
    174         return implode( '', $chars );
    175     }
    176 
    177     $regex = '/(
    178         [\x00-\x7F]                  # single-byte sequences   0xxxxxxx
    179         | [\xC2-\xDF][\x80-\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
    180         | \xE0[\xA0-\xBF][\x80-\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
    181         | [\xE1-\xEC][\x80-\xBF]{2}
    182         | \xED[\x80-\x9F][\x80-\xBF]
    183         | [\xEE-\xEF][\x80-\xBF]{2}
    184         | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
    185         | [\xF1-\xF3][\x80-\xBF]{3}
    186         | \xF4[\x80-\x8F][\x80-\xBF]{2}
    187     )/x';
    188 
    189     // Start with 1 element instead of 0 since the first thing we do is pop.
    190     $chars = array( '' );
    191 
    192     do {
    193         // We had some string left over from the last round, but we counted it in that last round.
    194         array_pop( $chars );
    195 
    196         /*
    197          * Split by UTF-8 character, limit to 1000 characters (last array element will contain
    198          * the rest of the string).
    199          */
    200         $pieces = preg_split( $regex, $str, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
    201 
    202         $chars = array_merge( $chars, $pieces );
    203 
    204         // If there's anything left over, repeat the loop.
    205     } while ( count( $pieces ) > 1 && $str = array_pop( $pieces ) );
    206 
    207     return implode( '', array_slice( $chars, $start, $length ) );
     170     * The starting offset is provided as characters, which means this needs to
     171     * find how many bytes that many characters occupy at the start of the string.
     172     */
     173    $starting_byte_offset = _wp_utf8_codepoint_span( $str, 0, $normalized_start );
     174
     175    $normalized_length = $length < 0
     176        ? max( 0, $total_length - $normalized_start + $length )
     177        : $length;
     178
     179    /*
     180     * This is the main step. It finds how many bytes the given length of code points
     181     * occupies in the input, starting at the byte offset calculated above.
     182     */
     183    $byte_length = isset( $normalized_length )
     184        ? _wp_utf8_codepoint_span( $str, $starting_byte_offset, $normalized_length )
     185        : ( strlen( $str ) - $starting_byte_offset );
     186
     187    // The result is a normal byte-level substring using the computed ranges.
     188    return substr( $str, $starting_byte_offset, $byte_length );
    208189}
    209190
  • trunk/tests/phpunit/tests/compat/mbSubstr.php

    r55562 r60969  
    1414     */
    1515    public function test_mb_substr_availability() {
    16         $this->assertTrue( function_exists( 'mb_substr' ) );
     16        $this->assertTrue(
     17            in_array( 'mb_substr', get_defined_functions()['internal'], true ),
     18            'Test runner should have `mbstring` extension active but doesn’t.'
     19        );
    1720    }
    1821
     
    2023     * @dataProvider data_utf8_substrings
    2124     */
    22     public function test_mb_substr( $input_string, $start, $length, $expected_character_substring ) {
    23         $this->assertSame( $expected_character_substring, _mb_substr( $input_string, $start, $length, 'UTF-8' ) );
     25    public function test_mb_substr( $input_string, $start, $length ) {
     26        $this->assertSame(
     27            mb_substr( $input_string, $start, $length, 'UTF-8' ),
     28            _mb_substr( $input_string, $start, $length, 'UTF-8' )
     29        );
    2430    }
    2531
     
    2733     * @dataProvider data_utf8_substrings
    2834     */
    29     public function test_mb_substr_via_regex( $input_string, $start, $length, $expected_character_substring ) {
    30         _wp_can_use_pcre_u( false );
    31         $this->assertSame( $expected_character_substring, _mb_substr( $input_string, $start, $length, 'UTF-8' ) );
    32         _wp_can_use_pcre_u( 'reset' );
    33     }
    34 
    35     /**
    36      * @dataProvider data_utf8_substrings
    37      */
    38     public function test_8bit_mb_substr( $input_string, $start, $length, $expected_character_substring, $expected_byte_substring ) {
    39         $this->assertSame( $expected_byte_substring, _mb_substr( $input_string, $start, $length, '8bit' ) );
     35    public function test_8bit_mb_substr( $input_string, $start, $length ) {
     36        $this->assertSame(
     37            mb_substr( $input_string, $start, $length, '8bit' ),
     38            _mb_substr( $input_string, $start, $length, '8bit' )
     39        );
    4040    }
    4141
     
    4343     * Data provider.
    4444     *
    45      * @return array
     45     * @return array[]
    4646     */
    4747    public function data_utf8_substrings() {
    4848        return array(
    49             array(
    50                 'input_string'                 => 'баба',
    51                 'start'                        => 0,
    52                 'length'                       => 3,
    53                 'expected_character_substring' => 'баб',
    54                 'expected_byte_substring'      => "б\xD0",
    55             ),
    56             array(
    57                 'input_string'                 => 'баба',
    58                 'start'                        => 0,
    59                 'length'                       => -1,
    60                 'expected_character_substring' => 'баб',
    61                 'expected_byte_substring'      => "баб\xD0",
    62             ),
    63             array(
    64                 'input_string'                 => 'баба',
    65                 'start'                        => 1,
    66                 'length'                       => null,
    67                 'expected_character_substring' => 'аба',
    68                 'expected_byte_substring'      => "\xB1аба",
    69             ),
    70             array(
    71                 'input_string'                 => 'баба',
    72                 'start'                        => -3,
    73                 'length'                       => null,
    74                 'expected_character_substring' => 'аба',
    75                 'expected_byte_substring'      => "\xB1а",
    76             ),
    77             array(
    78                 'input_string'                 => 'баба',
    79                 'start'                        => -3,
    80                 'length'                       => 2,
    81                 'expected_character_substring' => 'аб',
    82                 'expected_byte_substring'      => "\xB1\xD0",
    83             ),
    84             array(
    85                 'input_string'                 => 'баба',
    86                 'start'                        => -1,
    87                 'length'                       => 2,
    88                 'expected_character_substring' => 'а',
    89                 'expected_byte_substring'      => "\xB0",
    90             ),
    91             array(
    92                 'input_string'                 => 'I am your баба',
    93                 'start'                        => 0,
    94                 'length'                       => 11,
    95                 'expected_character_substring' => 'I am your б',
    96                 'expected_byte_substring'      => "I am your \xD0",
    97             ),
     49            'баба'           => array( 'баба', 0, 3 ),
     50            'баба'           => array( 'баба', 0, -1 ),
     51            'баба'           => array( 'баба', 1, null ),
     52            'баба'           => array( 'баба', -3, null ),
     53            'баба'           => array( 'баба', -3, 2 ),
     54            'баба'           => array( 'баба', -2, 1 ),
     55            'баба'           => array( 'баба', 30, 1 ),
     56            'баба'           => array( 'баба', 15, -30 ),
     57            'баба'           => array( 'баба', -5, -5 ),
     58            'баба'           => array( 'баба', 5, -3 ),
     59            'баба'           => array( 'баба', -3, 5 ),
     60            'I am your баба' => array( 'I am your баба', 0, 11 ),
    9861        );
    9962    }
     
    10467    public function test_mb_substr_phpcore_basic() {
    10568        $string_ascii = 'ABCDEF';
    106         $string_mb    = base64_decode( '5pel5pys6Kqe44OG44Kt44K544OI44Gn44GZ44CCMDEyMzTvvJXvvJbvvJfvvJjvvJnjgII=' );
     69        $string_mb    = '日本語テキストです。0123456789。';
    10770
    10871        $this->assertSame(
     
    11982        // Specific latin-1 as that is the default the core PHP test operates under.
    12083        $this->assertSame(
    121             'peacrOiqng==',
    122             base64_encode( _mb_substr( $string_mb, 2, 7, 'latin-1' ) ),
     84            "\xA5本語",
     85            _mb_substr( $string_mb, 2, 7, 'latin-1' ),
    12386            'Substring does not match expected for offset 2, length 7, with latin-1 charset'
    12487        );
    12588        $this->assertSame(
    126             '6Kqe44OG44Kt44K544OI44Gn44GZ',
    127             base64_encode( _mb_substr( $string_mb, 2, 7, 'utf-8' ) ),
     89            '語テキストです',
     90            _mb_substr( $string_mb, 2, 7, 'utf-8' ),
    12891            'Substring does not match expected for offset 2, length 7, with utf-8 charset'
    12992        );
Note: See TracChangeset for help on using the changeset viewer.