Changeset 60743
- Timestamp:
- 09/15/2025 07:07:06 PM (8 weeks ago)
- Location:
- trunk/src
- Files:
-
- 1 added
- 2 edited
-
wp-includes/compat-utf8.php (added)
-
wp-includes/formatting.php (modified) (1 diff)
-
wp-settings.php (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/formatting.php
r60734 r60743 968 968 ? mb_check_encoding( $bytes, 'UTF-8' ) 969 969 : _wp_is_valid_utf8_fallback( $bytes ); 970 }971 972 /**973 * Fallback mechanism for safely validating UTF-8 bytes.974 *975 * By implementing a raw method here the code will behave in the same way on976 * all installed systems, regardless of what extensions are installed.977 *978 * @see wp_is_valid_utf8979 *980 * @since 6.9.0981 * @access private982 *983 * @param string $bytes String which might contain text encoded as UTF-8.984 * @return bool Whether the provided bytes can decode as valid UTF-8.985 */986 function _wp_is_valid_utf8_fallback( string $bytes ): bool {987 $end = strlen( $bytes );988 989 for ( $i = 0; $i < $end; $i++ ) {990 /*991 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.992 *993 * This optimization step improves the speed from 10x to 100x994 * depending on whether the JIT has optimized the function.995 */996 $i += strspn(997 $bytes,998 "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .999 "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .1000 " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",1001 $i1002 );1003 if ( $i >= $end ) {1004 break;1005 }1006 1007 /**1008 * The above fast-track handled all single-byte UTF-8 characters. What1009 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.1010 *1011 * Therefore everything past here is checking those multibyte sequences.1012 * Because it’s possible that there are truncated characters, the use of1013 * the null-coalescing operator with "\xC0" is a convenience for skipping1014 * length checks on every continuation byte. This works because 0xC0 is1015 * always invalid in a UTF-8 string, meaning that if the string has been1016 * truncated, it will find 0xC0 and reject as invalid UTF-8.1017 *1018 * > [The following table] lists all of the byte sequences that are well-formed1019 * > in UTF-8. 
A range of byte values such as A0..BF indicates that any byte1020 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value1021 * > outside of the ranges listed is ill-formed.1022 *1023 * > Table 3-7. Well-Formed UTF-8 Byte Sequences1024 * ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮1025 * │ Code Points │ First Byte │ Second Byte │ Third Byte │ Fourth Byte │1026 * ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤1027 * │ U+0000..U+007F │ 00..7F │ │ │ │1028 * │ U+0080..U+07FF │ C2..DF │ 80..BF │ │ │1029 * │ U+0800..U+0FFF │ E0 │ A0..BF │ 80..BF │ │1030 * │ U+1000..U+CFFF │ E1..EC │ 80..BF │ 80..BF │ │1031 * │ U+D000..U+D7FF │ ED │ 80..9F │ 80..BF │ │1032 * │ U+E000..U+FFFF │ EE..EF │ 80..BF │ 80..BF │ │1033 * │ U+10000..U+3FFFF │ F0 │ 90..BF │ 80..BF │ 80..BF │1034 * │ U+40000..U+FFFFF │ F1..F3 │ 80..BF │ 80..BF │ 80..BF │1035 * │ U+100000..U+10FFFF │ F4 │ 80..8F │ 80..BF │ 80..BF │1036 * ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯1037 *1038 * Notice that all valid third and fourth bytes are in the range 80..BF. This1039 * validator takes advantage of that to only check the range of those bytes once.1040 *1041 * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/1042 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G275061043 */1044 1045 $b1 = ord( $bytes[ $i ] );1046 $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );1047 1048 // Valid two-byte code points.1049 1050 if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {1051 $i++;1052 continue;1053 }1054 1055 $b3 = ord( $bytes[ $i + 2 ] ?? 
"\xC0" );1056 1057 // Valid three-byte code points.1058 1059 if ( $b3 < 0x80 || $b3 > 0xBF ) {1060 return false;1061 }1062 1063 if (1064 ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||1065 ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||1066 ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||1067 ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )1068 ) {1069 $i += 2;1070 continue;1071 }1072 1073 $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );1074 1075 // Valid four-byte code points.1076 1077 if ( $b4 < 0x80 || $b4 > 0xBF ) {1078 return false;1079 }1080 1081 if (1082 ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||1083 ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||1084 ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )1085 ) {1086 $i += 3;1087 continue;1088 }1089 1090 // Any other sequence is invalid.1091 return false;1092 }1093 1094 // Reaching the end implies validating every byte.1095 return true;1096 970 } 1097 971 -
trunk/src/wp-settings.php
r60539 r60743 33 33 global $wp_version, $wp_db_version, $tinymce_version, $required_php_version, $required_php_extensions, $required_mysql_version, $wp_local_package; 34 34 require ABSPATH . WPINC . '/version.php'; 35 require ABSPATH . WPINC . '/compat-utf8.php'; 35 36 require ABSPATH . WPINC . '/compat.php'; 36 37 require ABSPATH . WPINC . '/load.php';
Note: See TracChangeset
for help on using the changeset viewer.