Changeset 60793
- Timestamp:
- 09/23/2025 03:34:20 AM (4 days ago)
- Location:
- trunk
- Files:
-
- 2 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/compat-utf8.php
r60768 r60793 228 228 * Fallback mechanism for safely validating UTF-8 bytes. 229 229 * 230 * @see wp_is_valid_utf8()231 *232 230 * @since 6.9.0 233 231 * @access private 232 * 233 * @see wp_is_valid_utf8() 234 234 * 235 235 * @param string $bytes String which might contain text encoded as UTF-8. … … 249 249 return $bytes_length === $next_byte_at && 0 === $invalid_length; 250 250 } 251 252 /** 253 * Fallback mechanism for replacing invalid spans of UTF-8 bytes. 254 * 255 * Example: 256 * 257 * 'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252. 258 * 259 * @since 6.9.0 260 * @access private 261 * 262 * @see wp_scrub_utf8() 263 * 264 * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes. 265 * @return string Input string with spans of invalid bytes swapped with the replacement character. 266 */ 267 function _wp_scrub_utf8_fallback( string $bytes ): string { 268 $bytes_length = strlen( $bytes ); 269 $next_byte_at = 0; 270 $was_at = 0; 271 $invalid_length = 0; 272 $scrubbed = ''; 273 274 while ( $next_byte_at <= $bytes_length ) { 275 _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length ); 276 277 if ( $next_byte_at >= $bytes_length ) { 278 if ( 0 === $was_at ) { 279 return $bytes; 280 } 281 282 return $scrubbed . substr( $bytes, $was_at, $next_byte_at - $was_at - $invalid_length ); 283 } 284 285 $scrubbed .= substr( $bytes, $was_at, $next_byte_at - $was_at ); 286 $scrubbed .= "\u{FFFD}"; 287 288 $next_byte_at += $invalid_length; 289 $was_at = $next_byte_at; 290 } 291 292 return $scrubbed; 293 } -
trunk/src/wp-includes/formatting.php
r60743 r60793 919 919 920 920 /** 921 * Determines if a given byte string represents a valid UTF-8 encoding.922 *923 * Note that it’s unlikely for non-UTF-8 data to validate as UTF-8, but924 * it is still possible. Many texts are simultaneously valid UTF-8,925 * valid US-ASCII, and valid ISO-8859-1 (`latin1`).926 *927 * Example:928 *929 * true === wp_is_valid_utf8( '' );930 * true === wp_is_valid_utf8( 'just a test' );931 * true === wp_is_valid_utf8( "\xE2\x9C\x8F" ); // Pencil, U+270F.932 * true === wp_is_valid_utf8( "\u{270F}" ); // Pencil, U+270F.933 * true === wp_is_valid_utf8( '✏' ); // Pencil, U+270F.934 *935 * false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes.936 * false === wp_is_valid_utf8( "\xE2\x9C" ); // Invalid/incomplete sequences.937 * false === wp_is_valid_utf8( "\xC1\xBF" ); // Overlong sequences.938 * false === wp_is_valid_utf8( "\xED\xB0\x80" ); // Surrogate halves.939 * false === wp_is_valid_utf8( "B\xFCch" ); // ISO-8859-1 high-bytes.940 * // E.g. The “ü” in ISO-8859-1 is a single byte 0xFC,941 * // but in UTF-8 is the two-byte sequence 0xC3 0xBC.942 *943 * A “valid” string consists of “well-formed UTF-8 code unit sequence[s],” meaning944 * that the bytes conform to the UTF-8 encoding scheme, all characters use the minimal945 * byte sequence required by UTF-8, and that no sequence encodes a UTF-16 surrogate946 * code point or any character above the representable range.947 *948 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G32860949 *950 * @see _wp_is_valid_utf8_fallback951 *952 * @since 6.9.0953 *954 * @param string $bytes String which might contain text encoded as UTF-8.955 * @return bool Whether the provided bytes can decode as valid UTF-8.956 */957 function wp_is_valid_utf8( string $bytes ): bool {958 /*959 * Since PHP 8.3.0 the UTF-8 validity is cached internally960 * on string objects, making this a direct property lookup.961 *962 * This is to be preferred exclusively once PHP 8.3.0 is963 * the 
minimum supported version, because even when the964 * status isn’t cached, it uses highly-optimized code to965 * validate the byte stream.966 */967 return function_exists( 'mb_check_encoding' )968 ? mb_check_encoding( $bytes, 'UTF-8' )969 : _wp_is_valid_utf8_fallback( $bytes );970 }971 972 /**973 921 * Converts a number of special characters into their HTML entities. 974 922 * … … 1142 1090 * Checks for invalid UTF8 in a string. 1143 1091 * 1092 * Note! This function only performs its work if the `blog_charset` is set 1093 * to UTF-8. For all other values it returns the input text unchanged. 1094 * 1095 * Note! Unless requested, this returns an empty string if the input contains 1096 * any sequences of invalid UTF-8. To replace invalid byte sequences, pass 1097 * `true` as the optional `$strip` parameter. 1098 * 1099 * Consider using {@see wp_scrub_utf8()} instead which does not depend on 1100 * the value of `blog_charset`. 1101 * 1102 * Example: 1103 * 1104 * // The `blog_charset` is `latin1`, so this returns the input unchanged. 1105 * $every_possible_input === wp_check_invalid_utf8( $every_possible_input ); 1106 * 1107 * // Valid strings come through unchanged. 1108 * 'test' === wp_check_invalid_utf8( 'test' ); 1109 * 1110 * $invalid = "the byte \xC0 is never allowed in a UTF-8 string."; 1111 * 1112 * // Invalid strings are rejected outright. 1113 * '' === wp_check_invalid_utf8( $invalid ); 1114 * 1115 * // “Stripping” invalid sequences produces the replacement character instead. 1116 * "the byte \u{FFFD} is never allowed in a UTF-8 string." === wp_check_invalid_utf8( $invalid, true ); 1117 * 'the byte � is never allowed in a UTF-8 string.' === wp_check_invalid_utf8( $invalid, true ); 1118 * 1144 1119 * @since 2.8.0 1145 * 1146 * @param string $text The text which is to be checked. 1147 * @param bool $strip Optional. Whether to attempt to strip out invalid UTF8. Default false. 
1120 * @since 6.9.0 Stripping replaces invalid byte sequences with the Unicode replacement character U+FFFD (�). 1121 * 1122 * @param string $text String which is expected to be encoded as UTF-8 unless `blog_charset` is another encoding. 1123 * @param bool $strip Optional. Whether to replace invalid sequences of bytes with the Unicode replacement 1124 * character (U+FFFD `�`). Default `false` returns an empty string for invalid UTF-8 inputs. 1148 1125 * @return string The checked text. 1149 1126 */ … … 1160 1137 $is_utf8 = is_utf8_charset(); 1161 1138 } 1162 if ( ! $is_utf8 ) { 1139 1140 if ( ! $is_utf8 || wp_is_valid_utf8( $text ) ) { 1163 1141 return $text; 1164 1142 } 1165 1143 1166 // Check for support for utf8 in the installed PCRE library once and store the result in a static. 1167 static $utf8_pcre = null; 1168 if ( ! isset( $utf8_pcre ) ) { 1169 // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged 1170 $utf8_pcre = @preg_match( '/^./u', 'a' ); 1171 } 1172 // We can't demand utf8 in the PCRE installation, so just return the string in those cases. 1173 if ( ! $utf8_pcre ) { 1174 return $text; 1175 } 1176 1177 // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- preg_match fails when it encounters invalid UTF8 in $text. 1178 if ( 1 === @preg_match( '/^./us', $text ) ) { 1179 return $text; 1180 } 1181 1182 // Attempt to strip the bad chars if requested (not recommended). 1183 if ( $strip && function_exists( 'iconv' ) ) { 1184 return iconv( 'utf-8', 'utf-8', $text ); 1185 } 1186 1187 return ''; 1144 return $strip 1145 ? wp_scrub_utf8( $text ) 1146 : ''; 1188 1147 } 1189 1148 -
trunk/src/wp-settings.php
r60743 r60793 112 112 require ABSPATH . WPINC . '/class-wp-list-util.php'; 113 113 require ABSPATH . WPINC . '/class-wp-token-map.php'; 114 require ABSPATH . WPINC . '/utf8.php'; 114 115 require ABSPATH . WPINC . '/formatting.php'; 115 116 require ABSPATH . WPINC . '/meta.php';
Note: See TracChangeset
for help on using the changeset viewer.