Changeset 60969
- Timestamp:
- 10/18/2025 04:34:02 AM (8 weeks ago)
- Location:
- trunk
- Files:
-
- 3 edited
-
src/wp-includes/compat-utf8.php (modified) (1 diff)
-
src/wp-includes/compat.php (modified) (4 diffs)
-
tests/phpunit/tests/compat/mbSubstr.php (modified) (6 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/compat-utf8.php
r60950 r60969 340 340 341 341 /** 342 * Given a starting offset within a string and a maximum number of code points, 343 * return how many bytes are occupied by the span of characters. 344 * 345 * Invalid spans of bytes count as a single code point according to the maximal 346 * subpart rule. This function is a fallback method for calling 347 * `strlen( mb_substr( substr( $text, $at ), 0, $max_code_points ) )`. 348 * 349 * @since 6.9.0 350 * @access private 351 * 352 * @param string $text Count bytes of span in this text. 353 * @param int $byte_offset Start counting at this byte offset. 354 * @param int $max_code_points Stop counting after this many code points have been seen, 355 * or at the end of the string. 356 * @param ?int $found_code_points Optional. Will be set to number of found code points in 357 * span, as this might be smaller than the maximum count if 358 * the string is not long enough. 359 * @return int Number of bytes spanned by the code points. 360 */ 361 function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int { 362 $was_at = $byte_offset; 363 $invalid_length = 0; 364 $end = strlen( $text ); 365 $found_code_points = 0; 366 367 while ( $byte_offset < $end && $found_code_points < $max_code_points ) { 368 $needed = $max_code_points - $found_code_points; 369 $chunk_count = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $needed ); 370 371 $found_code_points += $chunk_count; 372 373 // Invalid spans only convey one code point count regardless of how long they are. 374 if ( 0 !== $invalid_length && $found_code_points < $max_code_points ) { 375 ++$found_code_points; 376 $byte_offset += $invalid_length; 377 } 378 } 379 380 return $byte_offset - $was_at; 381 } 382 383 /** 342 384 * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility 343 385 * with the deprecated function from the PHP standard library. -
trunk/src/wp-includes/compat.php
r60950 r60969 34 34 * @ignore 35 35 * @since 4.2.2 36 * @since 6.9.0 Deprecated the `$set` argument. 36 37 * @access private 37 38 * 38 * @param bool $set - Used for testing only 39 * null : default - get PCRE/u capability 40 * false : Used for testing - return false for future calls to this function 41 * 'reset': Used for testing - restore default behavior of this function 39 * @param bool $set Deprecated. This argument is no longer used for testing purposes. 42 40 */ 43 41 function _wp_can_use_pcre_u( $set = null ) { 44 static $utf8_pcre = 'reset';45 46 if ( null !== $set) {47 $utf8_pcre = $set;48 } 49 50 if ( 'reset' === $utf8_pcre) {51 $utf8_pcre = true;52 53 set_error_handler( 54 function ( $errno, $errstr ) use ( &$utf8_pcre ) {55 if ( str_starts_with( $errstr, 'preg_match():' ) ) {56 $utf8_pcre = false;57 return true;58 }59 60 return false;61 }, 62 E_WARNING63 );64 65 /*66 * Attempt to compile a PCRE pattern with the PCRE_UTF8 flag. For 67 * systems lacking Unicode support this will trigger a warning68 * during compilation, which the error handler will intercept.69 */70 preg_match( '//u', '' );71 72 restore_error_handler();73 }42 static $utf8_pcre = null; 43 44 if ( isset( $set ) ) { 45 _deprecated_argument( __FUNCTION__, '6.9.0' ); 46 } 47 48 if ( isset( $utf8_pcre ) ) { 49 return $utf8_pcre; 50 } 51 52 $utf8_pcre = true; 53 set_error_handler( 54 function ( $errno, $errstr ) use ( &$utf8_pcre ) { 55 if ( str_starts_with( $errstr, 'preg_match():' ) ) { 56 $utf8_pcre = false; 57 return true; 58 } 59 60 return false; 61 }, 62 E_WARNING 63 ); 64 65 /* 66 * Attempt to compile a PCRE pattern with the PCRE_UTF8 flag. For 67 * systems lacking Unicode support this will trigger a warning 68 * during compilation, which the error handler will intercept. 69 */ 70 preg_match( '//u', '' ); 71 restore_error_handler(); 74 72 75 73 return $utf8_pcre; … … 137 135 * Internal compat function to mimic mb_substr(). 138 136 * 139 * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit.140 * For `$encoding === UTF-8`, the `$str` input is expected to be a valid UTF-8 byte141 * sequence. The behavior of this function for invalid inputs is undefined.137 * Only supports UTF-8 and non-shifting single-byte encodings. For all other encodings 138 * expect the substrings to be misaligned. When the given encoding (or the `blog_charset` 139 * if none is provided) isn’t UTF-8 then the function returns the output of {@see \substr()}. 142 140 * 143 141 * @ignore … … 145 143 * 146 144 * @param string $str The string to extract the substring from. 147 * @param int $start Position to being extraction from in `$str`.145 * @param int $start Character offset at which to start the substring extraction. 148 146 * @param int|null $length Optional. Maximum number of characters to extract from `$str`. 149 147 * Default null. … … 156 154 } 157 155 158 if ( null === $encoding ) { 159 $encoding = get_option( 'blog_charset' ); 160 } 156 // The solution below works only for UTF-8; treat all other encodings as byte streams. 157 if ( ! _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) ) ) { 158 return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length ); 159 } 160 161 $total_length = ( $start < 0 || $length < 0 ) 162 ? _wp_utf8_codepoint_count( $str ) 163 : 0; 164 165 $normalized_start = $start < 0 166 ? max( 0, $total_length + $start ) 167 : $start; 161 168 162 169 /* 163 * The solution below works only for UTF-8, so in case of a different 164 * charset just use built-in substr(). 165 */ 166 if ( ! _is_utf8_charset( $encoding ) ) { 167 return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length ); 168 } 169 170 if ( _wp_can_use_pcre_u() ) { 171 // Use the regex unicode support to separate the UTF-8 characters into an array. 172 preg_match_all( '/./us', $str, $match ); 173 $chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length ); 174 return implode( '', $chars ); 175 } 176 177 $regex = '/( 178 [\x00-\x7F] # single-byte sequences 0xxxxxxx 179 | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx 180 | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 181 | [\xE1-\xEC][\x80-\xBF]{2} 182 | \xED[\x80-\x9F][\x80-\xBF] 183 | [\xEE-\xEF][\x80-\xBF]{2} 184 | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 185 | [\xF1-\xF3][\x80-\xBF]{3} 186 | \xF4[\x80-\x8F][\x80-\xBF]{2} 187 )/x'; 188 189 // Start with 1 element instead of 0 since the first thing we do is pop. 190 $chars = array( '' ); 191 192 do { 193 // We had some string left over from the last round, but we counted it in that last round. 194 array_pop( $chars ); 195 196 /* 197 * Split by UTF-8 character, limit to 1000 characters (last array element will contain 198 * the rest of the string). 199 */ 200 $pieces = preg_split( $regex, $str, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); 201 202 $chars = array_merge( $chars, $pieces ); 203 204 // If there's anything left over, repeat the loop. 205 } while ( count( $pieces ) > 1 && $str = array_pop( $pieces ) ); 206 207 return implode( '', array_slice( $chars, $start, $length ) ); 170 * The starting offset is provided as characters, which means this needs to 171 * find how many bytes that many characters occupies at the start of the string. 172 */ 173 $starting_byte_offset = _wp_utf8_codepoint_span( $str, 0, $normalized_start ); 174 175 $normalized_length = $length < 0 176 ? max( 0, $total_length - $normalized_start + $length ) 177 : $length; 178 179 /* 180 * This is the main step. It finds how many bytes the given length of code points 181 * occupies in the input, starting at the byte offset calculated above. 182 */ 183 $byte_length = isset( $normalized_length ) 184 ? _wp_utf8_codepoint_span( $str, $starting_byte_offset, $normalized_length ) 185 : ( strlen( $str ) - $starting_byte_offset ); 186 187 // The result is a normal byte-level substring using the computed ranges. 188 return substr( $str, $starting_byte_offset, $byte_length ); 208 189 } 209 190 -
trunk/tests/phpunit/tests/compat/mbSubstr.php
r55562 r60969 14 14 */ 15 15 public function test_mb_substr_availability() { 16 $this->assertTrue( function_exists( 'mb_substr' ) ); 16 $this->assertTrue( 17 in_array( 'mb_substr', get_defined_functions()['internal'], true ), 18 'Test runner should have `mbstring` extension active but doesn’t.' 19 ); 17 20 } 18 21 … … 20 23 * @dataProvider data_utf8_substrings 21 24 */ 22 public function test_mb_substr( $input_string, $start, $length, $expected_character_substring ) { 23 $this->assertSame( $expected_character_substring, _mb_substr( $input_string, $start, $length, 'UTF-8' ) ); 25 public function test_mb_substr( $input_string, $start, $length ) { 26 $this->assertSame( 27 mb_substr( $input_string, $start, $length, 'UTF-8' ), 28 _mb_substr( $input_string, $start, $length, 'UTF-8' ) 29 ); 24 30 } 25 31 … … 27 33 * @dataProvider data_utf8_substrings 28 34 */ 29 public function test_mb_substr_via_regex( $input_string, $start, $length, $expected_character_substring ) { 30 _wp_can_use_pcre_u( false ); 31 $this->assertSame( $expected_character_substring, _mb_substr( $input_string, $start, $length, 'UTF-8' ) ); 32 _wp_can_use_pcre_u( 'reset' ); 33 } 34 35 /** 36 * @dataProvider data_utf8_substrings 37 */ 38 public function test_8bit_mb_substr( $input_string, $start, $length, $expected_character_substring, $expected_byte_substring ) { 39 $this->assertSame( $expected_byte_substring, _mb_substr( $input_string, $start, $length, '8bit' ) ); 35 public function test_8bit_mb_substr( $input_string, $start, $length ) { 36 $this->assertSame( 37 mb_substr( $input_string, $start, $length, '8bit' ), 38 _mb_substr( $input_string, $start, $length, '8bit' ) 39 ); 40 40 } 41 41 … … 43 43 * Data provider. 44 44 * 45 * @return array 45 * @return array[] 46 46 */ 47 47 public function data_utf8_substrings() { 48 48 return array( 49 array( 50 'input_string' => 'баба', 51 'start' => 0, 52 'length' => 3, 53 'expected_character_substring' => 'баб', 54 'expected_byte_substring' => "б\xD0", 55 ), 56 array( 57 'input_string' => 'баба', 58 'start' => 0, 59 'length' => -1, 60 'expected_character_substring' => 'баб', 61 'expected_byte_substring' => "баб\xD0", 62 ), 63 array( 64 'input_string' => 'баба', 65 'start' => 1, 66 'length' => null, 67 'expected_character_substring' => 'аба', 68 'expected_byte_substring' => "\xB1аба", 69 ), 70 array( 71 'input_string' => 'баба', 72 'start' => -3, 73 'length' => null, 74 'expected_character_substring' => 'аба', 75 'expected_byte_substring' => "\xB1а", 76 ), 77 array( 78 'input_string' => 'баба', 79 'start' => -3, 80 'length' => 2, 81 'expected_character_substring' => 'аб', 82 'expected_byte_substring' => "\xB1\xD0", 83 ), 84 array( 85 'input_string' => 'баба', 86 'start' => -1, 87 'length' => 2, 88 'expected_character_substring' => 'а', 89 'expected_byte_substring' => "\xB0", 90 ), 91 array( 92 'input_string' => 'I am your баба', 93 'start' => 0, 94 'length' => 11, 95 'expected_character_substring' => 'I am your б', 96 'expected_byte_substring' => "I am your \xD0", 97 ), 49 'баба' => array( 'баба', 0, 3 ), 50 'баба' => array( 'баба', 0, -1 ), 51 'баба' => array( 'баба', 1, null ), 52 'баба' => array( 'баба', -3, null ), 53 'баба' => array( 'баба', -3, 2 ), 54 'баба' => array( 'баба', -2, 1 ), 55 'баба' => array( 'баба', 30, 1 ), 56 'баба' => array( 'баба', 15, -30 ), 57 'баба' => array( 'баба', -5, -5 ), 58 'баба' => array( 'баба', 5, -3 ), 59 'баба' => array( 'баба', -3, 5 ), 60 'I am your баба' => array( 'I am your баба', 0, 11 ), 98 61 ); 99 62 } … … 104 67 public function test_mb_substr_phpcore_basic() { 105 68 $string_ascii = 'ABCDEF'; 106 $string_mb = base64_decode( '5pel5pys6Kqe44OG44Kt44K544OI44Gn44GZ44CCMDEyMzTvvJXvvJbvvJfvvJjvvJnjgII=' );69 $string_mb = '日本語テキストです。0123456789。'; 107 70 108 71 $this->assertSame( … … 119 82 // Specific latin-1 as that is the default the core PHP test operates under. 120 83 $this->assertSame( 121 'peacrOiqng==',122 base64_encode( _mb_substr( $string_mb, 2, 7, 'latin-1' )),84 "\xA5本語", 85 _mb_substr( $string_mb, 2, 7, 'latin-1' ), 123 86 'Substring does not match expected for offset 2, length 7, with latin-1 charset' 124 87 ); 125 88 $this->assertSame( 126 ' 6Kqe44OG44Kt44K544OI44Gn44GZ',127 base64_encode( _mb_substr( $string_mb, 2, 7, 'utf-8' )),89 '語テキストです', 90 _mb_substr( $string_mb, 2, 7, 'utf-8' ), 128 91 'Substring does not match expected for offset 2, length 7, with utf-8 charset' 129 92 );
Note: See TracChangeset
for help on using the changeset viewer.