Make WordPress Core

Changeset 60743


Ignore:
Timestamp:
09/15/2025 07:07:06 PM (8 weeks ago)
Author:
dmsnell
Message:

Charset: Create compat-utf8.php module with fallback code.

This is the second in a series of patches to modernize and standardize UTF-8 handling.

When the fallback UTF-8 validation code was added it was placed inside formatting.php; however, that validation logic can be reused for a number of related UTF-8 functions. To facilitate this it should move into a new location and be loaded early. This patch is the first half of doing that, whereby the original fallback function is moved unchanged to the compat-utf8.php module. The follow-up patch will abstract the UTF-8 scanning logic for reuse. Splitting this into a move and a separate change involves an extra step, but facilitates tracking the heritage of the code through the changes.

Developed in https://github.com/WordPress/wordpress-develop/pull/9825
Discussed in https://core.trac.wordpress.org/ticket/63863

Follow-up to: [60630].

See #63863.

Location:
trunk/src
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/formatting.php

    r60734 r60743  
    968968        ? mb_check_encoding( $bytes, 'UTF-8' )
    969969        : _wp_is_valid_utf8_fallback( $bytes );
    970 }
    971 
    972 /**
    973  * Fallback mechanism for safely validating UTF-8 bytes.
    974  *
    975  * By implementing a raw method here the code will behave in the same way on
    976  * all installed systems, regardless of what extensions are installed.
    977  *
    978  * @see wp_is_valid_utf8()
    979  *
    980  * @since 6.9.0
    981  * @access private
    982  *
    983  * @param string $bytes String which might contain text encoded as UTF-8.
    984  * @return bool Whether the provided bytes can decode as valid UTF-8.
    985  */
    986 function _wp_is_valid_utf8_fallback( string $bytes ): bool {
    987     $end = strlen( $bytes );
    988 
    989     for ( $i = 0; $i < $end; $i++ ) {
    990         /*
    991          * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
    992          *
    993          * This optimization step improves the speed from 10x to 100x
    994          * depending on whether the JIT has optimized the function.
    995          */
    996         $i += strspn(
    997             $bytes,
    998             "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
    999             "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
    1000             " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
    1001             $i
    1002         );
    1003         if ( $i >= $end ) {
    1004             break;
    1005         }
    1006 
    1007         /*
    1008          * The above fast-track handled all single-byte UTF-8 characters. What
    1009          * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
    1010          *
    1011          * Therefore everything past here is checking those multibyte sequences.
    1012          * Because it’s possible that there are truncated characters, the use of
    1013          * the null-coalescing operator with "\xC0" is a convenience for skipping
    1014          * length checks on every continuation byte. This works because 0xC0 is
    1015          * always invalid in a UTF-8 string, meaning that if the string has been
    1016          * truncated, it will find 0xC0 and reject as invalid UTF-8.
    1017          *
    1018          * > [The following table] lists all of the byte sequences that are well-formed
    1019          * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
    1020          * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
    1021          * > outside of the ranges listed is ill-formed.
    1022          *
    1023          * > Table 3-7. Well-Formed UTF-8 Byte Sequences
    1024          *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
    1025          *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
    1026          *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
    1027          *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
    1028          *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
    1029          *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
    1030          *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
    1031          *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
    1032          *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
    1033          *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
    1034          *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
    1035          *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
    1036          *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
    1037          *
    1038          * Notice that all valid third and fourth bytes are in the range 80..BF. This
    1039          * validator takes advantage of that to only check the range of those bytes once.
    1040          *
    1041          * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
    1042          * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
    1043          */
    1044 
    1045         $b1 = ord( $bytes[ $i ] );
    1046         $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
    1047 
    1048         // Valid two-byte code points.
    1049 
    1050         if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
    1051             $i++;
    1052             continue;
    1053         }
    1054 
    1055         $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
    1056 
    1057         // Valid three-byte code points.
    1058 
    1059         if ( $b3 < 0x80 || $b3 > 0xBF ) {
    1060             return false;
    1061         }
    1062 
    1063         if (
    1064             ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
    1065             ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
    1066             ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
    1067             ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
    1068         ) {
    1069             $i += 2;
    1070             continue;
    1071         }
    1072 
    1073         $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
    1074 
    1075         // Valid four-byte code points.
    1076 
    1077         if ( $b4 < 0x80 || $b4 > 0xBF ) {
    1078             return false;
    1079         }
    1080 
    1081         if (
    1082             ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
    1083             ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
    1084             ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
    1085         ) {
    1086             $i += 3;
    1087             continue;
    1088         }
    1089 
    1090         // Any other sequence is invalid.
    1091         return false;
    1092     }
    1093 
    1094     // Reaching the end implies validating every byte.
    1095     return true;
    1096970}
    1097971
  • trunk/src/wp-settings.php

    r60539 r60743  
    3333global $wp_version, $wp_db_version, $tinymce_version, $required_php_version, $required_php_extensions, $required_mysql_version, $wp_local_package;
    3434require ABSPATH . WPINC . '/version.php';
     35require ABSPATH . WPINC . '/compat-utf8.php';
    3536require ABSPATH . WPINC . '/compat.php';
    3637require ABSPATH . WPINC . '/load.php';
Note: See TracChangeset for help on using the changeset viewer.