Make WordPress Core

Changeset 60949


Ignore:
Timestamp:
10/16/2025 08:58:26 PM (4 months ago)
Author:
dmsnell
Message:

Charset: Rely on new UTF-8 pipeline for mb_strlen() fallback.

The existing polyfill for mb_strlen() contains a number of issues leaving plenty of opportunity for improvement. Specifically, the following are all deficiencies: it relies on Unicode PCRE support, assumes input strings are valid UTF-8, splits input strings into an array of characters to count them (1,000 at a time, iterating until complete), and entirely gives up when the Unicode support is missing.

This patch provides an updated polyfill which will reliably count code points in a UTF-8 string, even in the presence of sequences of invalid bytes. It scans through the input with zero allocations. Additionally, the underlying fallback extends the behavior of mb_strlen() to provide character counts for substrings within a larger input without extracting the substring (it can count characters within a byte offset and length of a larger string).

This change improves the reliability of UTF-8 string length calculations and removes behavioral variability based on the runtime system.

Developed in https://github.com/WordPress/wordpress-develop/pull/9828
Discussed in https://core.trac.wordpress.org/ticket/63863

See #63863.

Location:
trunk
Files:
1 added
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/compat-utf8.php

    r60808 r60949  
    292292    return $scrubbed;
    293293}
     294
     295/**
     296 * Returns how many code points are found in the given UTF-8 string.
     297 *
     298 * Invalid spans of bytes count as a single code point according
     299 * to the maximal subpart rule. This function is a fallback method
     300 * for calling `mb_strlen( $text, 'UTF-8' )`.
     301 *
     302 * When negative values are provided for the byte offsets or length,
     303 * this will always report zero code points.
     304 *
     305 * Example:
     306 *
     307 *     4  === _wp_utf8_codepoint_count( 'text' );
     308 *
     309 *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
     310 *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
     311 *
     312 * @since 6.9.0
     313 * @access private
     314 *
     315 * @param string $text            Count code points in this string.
     316 * @param ?int   $byte_offset     Start counting after this many bytes in `$text`. Must be positive.
     317 * @param ?int   $max_byte_length Optional. Stop counting after having scanned past this many bytes.
     318 *                                Default is to scan until the end of the string. Must be positive.
     319 * @return int How many code points were found.
     320 */
     321function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
     322    if ( $byte_offset < 0 ) {
     323        return 0;
     324    }
     325
     326    $count           = 0;
     327    $at              = $byte_offset;
     328    $end             = strlen( $text );
     329    $invalid_length  = 0;
     330    $max_byte_length = min( $end - $at, $max_byte_length );
     331
     332    while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) {
     333        $count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) );
     334        $count += $invalid_length > 0 ? 1 : 0;
     335        $at    += $invalid_length;
     336    }
     337
     338    return $count;
     339}
  • trunk/src/wp-includes/compat.php

    r60694 r60949  
    229229 * Internal compat function to mimic mb_strlen().
    230230 *
    231  * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit.
    232  * For `$encoding === UTF-8`, the `$str` input is expected to be a valid UTF-8 byte
    233  * sequence. The behavior of this function for invalid inputs is undefined.
     231 * Only supports UTF-8 and non-shifting single-byte encodings. For all other
     232 * encodings expect the counts to be wrong. When the given encoding (or the
     233 * `blog_charset` if none is provided) isn’t UTF-8 then the function returns
     234 * the byte-count of the provided string.
    234235 *
    235236 * @ignore
     
    237238 *
    238239 * @param string      $str      The string to retrieve the character length from.
    239  * @param string|null $encoding Optional. Character encoding to use. Default null.
    240  * @return int String length of `$str`.
     240 * @param string|null $encoding Optional. Count characters according to this encoding.
     241 *                              Default is to consult `blog_charset`.
     242 * @return int Count of code points if UTF-8, byte length otherwise.
    241243 */
    242244function _mb_strlen( $str, $encoding = null ) {
    243     if ( null === $encoding ) {
    244         $encoding = get_option( 'blog_charset' );
    245     }
    246 
    247     /*
    248      * The solution below works only for UTF-8, so in case of a different charset
    249      * just use built-in strlen().
    250      */
    251     if ( ! _is_utf8_charset( $encoding ) ) {
    252         return strlen( $str );
    253     }
    254 
    255     if ( _wp_can_use_pcre_u() ) {
    256         // Use the regex unicode support to separate the UTF-8 characters into an array.
    257         preg_match_all( '/./us', $str, $match );
    258         return count( $match[0] );
    259     }
    260 
    261     $regex = '/(?:
    262         [\x00-\x7F]                  # single-byte sequences   0xxxxxxx
    263         | [\xC2-\xDF][\x80-\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
    264         | \xE0[\xA0-\xBF][\x80-\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
    265         | [\xE1-\xEC][\x80-\xBF]{2}
    266         | \xED[\x80-\x9F][\x80-\xBF]
    267         | [\xEE-\xEF][\x80-\xBF]{2}
    268         | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
    269         | [\xF1-\xF3][\x80-\xBF]{3}
    270         | \xF4[\x80-\x8F][\x80-\xBF]{2}
    271     )/x';
    272 
    273     // Start at 1 instead of 0 since the first thing we do is decrement.
    274     $count = 1;
    275 
    276     do {
    277         // We had some string left over from the last round, but we counted it in that last round.
    278         --$count;
    279 
    280         /*
    281          * Split by UTF-8 character, limit to 1000 characters (last array element will contain
    282          * the rest of the string).
    283          */
    284         $pieces = preg_split( $regex, $str, 1000 );
    285 
    286         // Increment.
    287         $count += count( $pieces );
    288 
    289         // If there's anything left over, repeat the loop.
    290     } while ( $str = array_pop( $pieces ) );
    291 
    292     // Fencepost: preg_split() always returns one extra item in the array.
    293     return --$count;
     245    return _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) )
     246        ? _wp_utf8_codepoint_count( $str )
     247        : strlen( $str );
    294248}
    295249
  • trunk/tests/phpunit/tests/compat/mbStrlen.php

    r55562 r60949  
    1111
    1212    /**
    13      * Test that mb_strlen() is always available (either from PHP or WP).
     13     * Test that the native mb_strlen() is available.
    1414     */
    1515    public function test_mb_strlen_availability() {
    16         $this->assertTrue( function_exists( 'mb_strlen' ) );
     16        $this->assertTrue(
     17            in_array( 'mb_strlen', get_defined_functions()['internal'], true ),
     18            'Test runner should have `mbstring` extension active but doesn’t.'
     19        );
    1720    }
    1821
    1922    /**
    20      * @dataProvider data_utf8_string_lengths
     23     * @dataProvider data_utf8_strings
    2124     */
    22     public function test_mb_strlen( $input_string, $expected_character_length ) {
    23         $this->assertSame( $expected_character_length, _mb_strlen( $input_string, 'UTF-8' ) );
     25    public function test_mb_strlen( $input_string ) {
     26        $this->assertSame(
     27            mb_strlen( $input_string, 'UTF-8' ),
     28            _mb_strlen( $input_string, 'UTF-8' )
     29        );
    2430    }
    2531
    2632    /**
    27      * @dataProvider data_utf8_string_lengths
     33     * @dataProvider data_utf8_strings
    2834     */
    29     public function test_mb_strlen_via_regex( $input_string, $expected_character_length ) {
    30         _wp_can_use_pcre_u( false );
    31         $this->assertSame( $expected_character_length, _mb_strlen( $input_string, 'UTF-8' ) );
    32         _wp_can_use_pcre_u( 'reset' );
     35    public function test_mb_strlen_via_regex( $input_string ) {
     36        $this->assertSame(
     37            mb_strlen( $input_string, 'UTF-8' ),
     38            _mb_strlen( $input_string, 'UTF-8' )
     39        );
    3340    }
    3441
    3542    /**
    36      * @dataProvider data_utf8_string_lengths
     43     * @dataProvider data_utf8_strings
    3744     */
    38     public function test_8bit_mb_strlen( $input_string, $expected_character_length, $expected_byte_length ) {
    39         $this->assertSame( $expected_byte_length, _mb_strlen( $input_string, '8bit' ) );
     45    public function test_8bit_mb_strlen( $input_string ) {
     46        $this->assertSame(
     47            mb_strlen( $input_string, '8bit' ),
     48            _mb_strlen( $input_string, '8bit' )
     49        );
    4050    }
    4151
     
    4555     * @return array
    4656     */
    47     public function data_utf8_string_lengths() {
     57    public function data_utf8_strings() {
    4858        return array(
    49             array(
    50                 'input_string'              => 'баба',
    51                 'expected_character_length' => 4,
    52                 'expected_byte_length'      => 8,
    53             ),
    54             array(
    55                 'input_string'              => 'баб',
    56                 'expected_character_length' => 3,
    57                 'expected_byte_length'      => 6,
    58             ),
    59             array(
    60                 'input_string'              => 'I am your б',
    61                 'expected_character_length' => 11,
    62                 'expected_byte_length'      => 12,
    63             ),
    64             array(
    65                 'input_string'              => '1111111111',
    66                 'expected_character_length' => 10,
    67                 'expected_byte_length'      => 10,
    68             ),
    69             array(
    70                 'input_string'              => '²²²²²²²²²²',
    71                 'expected_character_length' => 10,
    72                 'expected_byte_length'      => 20,
    73             ),
    74             array(
    75                 'input_string'              => '３３３３３３３３３３',
    76                 'expected_character_length' => 10,
    77                 'expected_byte_length'      => 30,
    78             ),
    79             array(
    80                 'input_string'              => '𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜',
    81                 'expected_character_length' => 10,
    82                 'expected_byte_length'      => 40,
    83             ),
    84             array(
    85                 'input_string'              => '1²３𝟜1²３𝟜1²３𝟜',
    86                 'expected_character_length' => 12,
    87                 'expected_byte_length'      => 30,
    88             ),
     59            array( 'баба' ),
     60            array( 'баб' ),
     61            array( 'I am your б' ),
     62            array( '1111111111' ),
     63            array( '²²²²²²²²²²' ),
     64            array( '３３３３３３３３３３' ),
     65            array( '𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜' ),
     66            array( '1²３𝟜1²３𝟜1²３𝟜' ),
    8967        );
    9068    }
Note: See TracChangeset for help on using the changeset viewer.