Changeset 60949
- Timestamp:
- 10/16/2025 08:58:26 PM (4 months ago)
- Location:
- trunk
- Files:
-
- 1 added
- 3 edited
-
src/wp-includes/compat-utf8.php (modified) (1 diff)
-
src/wp-includes/compat.php (modified) (2 diffs)
-
tests/phpunit/tests/compat/mbStrlen.php (modified) (2 diffs)
-
tests/phpunit/tests/compat/wpCodePointCount.php (added)
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/compat-utf8.php
/**
 * Returns how many code points are found in the given UTF-8 string.
 *
 * Invalid spans of bytes count as a single code point according
 * to the maximal subpart rule. This function is a fallback method
 * for calling `mb_strlen( $text, 'UTF-8' )`.
 *
 * When a negative value is provided for the byte offset or length,
 * this will always report zero code points. Passing `null` for either
 * of them is the same as omitting the argument.
 *
 * Example:
 *
 *     4 === _wp_utf8_codepoint_count( 'text' );
 *
 *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
 *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
 *
 * @since 6.9.0
 * @access private
 *
 * @param string $text            Count code points in this string.
 * @param ?int   $byte_offset     Optional. Start counting after this many bytes in `$text`.
 *                                Must not be negative. Default 0.
 * @param ?int   $max_byte_length Optional. Stop counting after having scanned past this many bytes.
 *                                Must not be negative. Default is to scan until the end of the string.
 * @return int How many code points were found.
 */
function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
	/*
	 * Both arguments are nullable, so an explicit `null` must behave like the
	 * default. Without this, `min( $n, null )` yields `null` and the loop guard
	 * `0 < null` is always false, which reports zero code points for any input.
	 */
	$byte_offset     = $byte_offset ?? 0;
	$max_byte_length = $max_byte_length ?? PHP_INT_MAX;

	if ( $byte_offset < 0 ) {
		return 0;
	}

	$count          = 0;
	$at             = $byte_offset;
	$end            = strlen( $text );
	$invalid_length = 0;

	/*
	 * Never scan past the end of the string. When the offset lies beyond the
	 * end, or a negative length was given, this budget is zero or negative
	 * and the loop below does not run.
	 */
	$max_byte_length = min( $end - $at, $max_byte_length );

	while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) {
		/*
		 * NOTE(review): `_wp_scan_utf8()` is defined outside this view. It is
		 * presumed to return the number of valid code points it scanned, to
		 * advance `$at` by reference, and to report the byte length of any
		 * invalid span it stopped at through `$invalid_length` - confirm
		 * against its definition.
		 */
		$count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) );

		// An invalid span counts as exactly one code point; step over it.
		$count += $invalid_length > 0 ? 1 : 0;
		$at    += $invalid_length;
	}

	return $count;
}
trunk/src/wp-includes/compat.php
r60694 r60949 229 229 * Internal compat function to mimic mb_strlen(). 230 230 * 231 * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit. 232 * For `$encoding === UTF-8`, the `$str` input is expected to be a valid UTF-8 byte 233 * sequence. The behavior of this function for invalid inputs is undefined. 231 * Only supports UTF-8 and non-shifting single-byte encodings. For all other 232 * encodings expect the counts to be wrong. When the given encoding (or the 233 * `blog_charset` if none is provided) isn’t UTF-8 then the function returns 234 * the byte-count of the provided string. 234 235 * 235 236 * @ignore … … 237 238 * 238 239 * @param string $str The string to retrieve the character length from. 239 * @param string|null $encoding Optional. Character encoding to use. Default null. 240 * @return int String length of `$str`. 240 * @param string|null $encoding Optional. Count characters according to this encoding. 241 * Default is to consult `blog_charset`. 242 * @return int Count of code points if UTF-8, byte length otherwise. 241 243 */ 242 244 function _mb_strlen( $str, $encoding = null ) { 243 if ( null === $encoding ) { 244 $encoding = get_option( 'blog_charset' ); 245 } 246 247 /* 248 * The solution below works only for UTF-8, so in case of a different charset 249 * just use built-in strlen(). 250 */ 251 if ( ! _is_utf8_charset( $encoding ) ) { 252 return strlen( $str ); 253 } 254 255 if ( _wp_can_use_pcre_u() ) { 256 // Use the regex unicode support to separate the UTF-8 characters into an array. 
257 preg_match_all( '/./us', $str, $match ); 258 return count( $match[0] ); 259 } 260 261 $regex = '/(?: 262 [\x00-\x7F] # single-byte sequences 0xxxxxxx 263 | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx 264 | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 265 | [\xE1-\xEC][\x80-\xBF]{2} 266 | \xED[\x80-\x9F][\x80-\xBF] 267 | [\xEE-\xEF][\x80-\xBF]{2} 268 | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 269 | [\xF1-\xF3][\x80-\xBF]{3} 270 | \xF4[\x80-\x8F][\x80-\xBF]{2} 271 )/x'; 272 273 // Start at 1 instead of 0 since the first thing we do is decrement. 274 $count = 1; 275 276 do { 277 // We had some string left over from the last round, but we counted it in that last round. 278 --$count; 279 280 /* 281 * Split by UTF-8 character, limit to 1000 characters (last array element will contain 282 * the rest of the string). 283 */ 284 $pieces = preg_split( $regex, $str, 1000 ); 285 286 // Increment. 287 $count += count( $pieces ); 288 289 // If there's anything left over, repeat the loop. 290 } while ( $str = array_pop( $pieces ) ); 291 292 // Fencepost: preg_split() always returns one extra item in the array. 293 return --$count; 245 return _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) ) 246 ? _wp_utf8_codepoint_count( $str ) 247 : strlen( $str ); 294 248 } 295 249 -
trunk/tests/phpunit/tests/compat/mbStrlen.php
r55562 r60949 11 11 12 12 /** 13 * Test that mb_strlen() is always available (either from PHP or WP).13 * Test that the native mb_strlen() is available. 14 14 */ 15 15 public function test_mb_strlen_availability() { 16 $this->assertTrue( function_exists( 'mb_strlen' ) ); 16 $this->assertTrue( 17 in_array( 'mb_strlen', get_defined_functions()['internal'], true ), 18 'Test runner should have `mbstring` extension active but doesn’t.' 19 ); 17 20 } 18 21 19 22 /** 20 * @dataProvider data_utf8_string _lengths23 * @dataProvider data_utf8_strings 21 24 */ 22 public function test_mb_strlen( $input_string, $expected_character_length ) { 23 $this->assertSame( $expected_character_length, _mb_strlen( $input_string, 'UTF-8' ) ); 25 public function test_mb_strlen( $input_string ) { 26 $this->assertSame( 27 mb_strlen( $input_string, 'UTF-8' ), 28 _mb_strlen( $input_string, 'UTF-8' ) 29 ); 24 30 } 25 31 26 32 /** 27 * @dataProvider data_utf8_string _lengths33 * @dataProvider data_utf8_strings 28 34 */ 29 public function test_mb_strlen_via_regex( $input_string, $expected_character_length ) { 30 _wp_can_use_pcre_u( false ); 31 $this->assertSame( $expected_character_length, _mb_strlen( $input_string, 'UTF-8' ) ); 32 _wp_can_use_pcre_u( 'reset' ); 35 public function test_mb_strlen_via_regex( $input_string ) { 36 $this->assertSame( 37 mb_strlen( $input_string, 'UTF-8' ), 38 _mb_strlen( $input_string, 'UTF-8' ) 39 ); 33 40 } 34 41 35 42 /** 36 * @dataProvider data_utf8_string _lengths43 * @dataProvider data_utf8_strings 37 44 */ 38 public function test_8bit_mb_strlen( $input_string, $expected_character_length, $expected_byte_length ) { 39 $this->assertSame( $expected_byte_length, _mb_strlen( $input_string, '8bit' ) ); 45 public function test_8bit_mb_strlen( $input_string ) { 46 $this->assertSame( 47 mb_strlen( $input_string, '8bit' ), 48 _mb_strlen( $input_string, '8bit' ) 49 ); 40 50 } 41 51 … … 45 55 * @return array 46 56 */ 47 public function data_utf8_string _lengths() {57 public 
function data_utf8_strings() { 48 58 return array( 49 array( 50 'input_string' => 'баба', 51 'expected_character_length' => 4, 52 'expected_byte_length' => 8, 53 ), 54 array( 55 'input_string' => 'баб', 56 'expected_character_length' => 3, 57 'expected_byte_length' => 6, 58 ), 59 array( 60 'input_string' => 'I am your б', 61 'expected_character_length' => 11, 62 'expected_byte_length' => 12, 63 ), 64 array( 65 'input_string' => '1111111111', 66 'expected_character_length' => 10, 67 'expected_byte_length' => 10, 68 ), 69 array( 70 'input_string' => '²²²²²²²²²²', 71 'expected_character_length' => 10, 72 'expected_byte_length' => 20, 73 ), 74 array( 75 'input_string' => '3333333333', 76 'expected_character_length' => 10, 77 'expected_byte_length' => 30, 78 ), 79 array( 80 'input_string' => '𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜', 81 'expected_character_length' => 10, 82 'expected_byte_length' => 40, 83 ), 84 array( 85 'input_string' => '1²3𝟜1²3𝟜1²3𝟜', 86 'expected_character_length' => 12, 87 'expected_byte_length' => 30, 88 ), 59 array( 'баба' ), 60 array( 'баб' ), 61 array( 'I am your б' ), 62 array( '1111111111' ), 63 array( '²²²²²²²²²²' ), 64 array( '3333333333' ), 65 array( '𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜' ), 66 array( '1²3𝟜1²3𝟜1²3𝟜' ), 89 67 ); 90 68 }
Note: See TracChangeset
for help on using the changeset viewer.