Make WordPress Core

Changeset 60768


Ignore:
Timestamp:
09/16/2025 12:35:01 PM (7 months ago)
Author:
dmsnell
Message:

Charset: Introduce UTF-8 scanning pipeline.

This is the third in a series of patches to modernize and standardize UTF-8 handling.

When the fallback UTF-8 validation code was added it was placed inside formatting.php; however, that validation logic can be reused for a number of related UTF-8 functions. To facilitate this it was moved into a new location and loaded early. This patch is a follow-up to that first half, whereby the UTF-8 scanning logic forms its own new _wp_scan_utf8() function. This new UTF-8 scanner is a low-level function which forms a shared spec-compliant processing core to power multiple fallback functions and some new functionality as well.

Developed in https://github.com/WordPress/wordpress-develop/pull/9830
Discussed in https://core.trac.wordpress.org/ticket/63863

Follow-up to: [60743].

See #63863.

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/phpcs.xml.dist

    r58925 r60768  
    256256    </rule>
    257257
    258     <!-- Exclude forbidding goto in the HTML Processor, which mimics algorithms that are written
    259          this way in the HTML specification, and these particular algorithms are complex and
    260          highly imperative. Avoiding the goto introduces a number of risks that could make it
    261          more difficult to maintain the relationship to the standard, lead to subtle differences
    262          in the parsing, and distance the code from its standard. -->
    263258    <rule ref="Generic.PHP.DiscourageGoto.Found">
     259        <!-- Exclude forbidding goto in the HTML Processor, which mimics algorithms that are written
     260             this way in the HTML specification, and these particular algorithms are complex and
     261             highly imperative. Avoiding the goto introduces a number of risks that could make it
     262             more difficult to maintain the relationship to the standard, lead to subtle differences
     263             in the parsing, and distance the code from its standard. -->
    264264        <exclude-pattern>/wp-includes/html-api/class-wp-html-processor\.php</exclude-pattern>
    265265        <exclude-pattern>/wp-includes/html-api/class-wp-html-doctype-info\.php</exclude-pattern>
     266
     267        <!-- Goto is an effective way to handle errors in decoders which expect valid bytes
     268             without impacting the fast path while avoiding bloating the code with redundant
     269             and risky handling code. Exclude forbidding goto in UTF-8 fallback code. -->
     270        <exclude-pattern>/wp-includes/compat-utf8\.php</exclude-pattern>
    266271    </rule>
    267272
  • trunk/src/wp-includes/compat-utf8.php

    r60764 r60768  
    22
    33/**
    4  * Fallback mechanism for safely validating UTF-8 bytes.
    5  *
    6  * By implementing a raw method here the code will behave in the same way on
    7  * all installed systems, regardless of what extensions are installed.
    8  *
    9  * @see wp_is_valid_utf8
     4 * Finds spans of valid and invalid UTF-8 bytes in a given string.
     5 *
     6 * This is a low-level tool to power various UTF-8 functionality.
     7 * It scans through a string until it finds invalid byte spans.
     8 * When it does this, it does three things:
     9 *
     10 *  - Assigns `$at` to the position after the last successful code point.
     11 *  - Assigns `$invalid_length` to the length of the maximal subpart of
     12 *    the invalid bytes starting at `$at`.
     13 *  - Returns how many code points were successfully scanned.
     14 *
     15 * This information is enough to build a number of useful UTF-8 functions.
     16 *
     17 * Example:
     18 *
     19 *     // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1.
     20 *     "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' );
     21 *     $at = $invalid_length = 0;
     22 *
     23 *     // The first step finds the invalid 0xF1 byte.
     24 *     2 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
     25 *     $at === 2; $invalid_length === 1;
     26 *
     27 *     // After advancing past the invalid byte (`$at += $invalid_length`), the second step continues to the end of the string.
     28 *     1 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
     29 *     $at === 4; $invalid_length === 0;
     30 *
     31 * Note! This function’s many arguments are passed without an “options”
     32 * array. This choice is based on the fact that this is a low-level function
     33 * and there’s no need to create an array of items on every invocation.
    1034 *
    1135 * @since 6.9.0
    1236 * @access private
    1337 *
    14  * @param string $bytes String which might contain text encoded as UTF-8.
    15  * @return bool Whether the provided bytes can decode as valid UTF-8.
     38 * @param string   $bytes           UTF-8 encoded string which might include invalid spans of bytes.
     39 * @param int      $at              Where to start scanning.
     40 * @param int      $invalid_length  Will be set to how many bytes are to be ignored after `$at`.
     41 * @param int|null $max_bytes       Stop scanning after this many bytes have been seen.
     42 * @param int|null $max_code_points Stop scanning after this many code points have been seen.
     43 * @return int How many code points were successfully scanned.
    1644 */
    17 function _wp_is_valid_utf8_fallback( string $bytes ): bool {
    18     $end = strlen( $bytes );
    19 
    20     for ( $i = 0; $i < $end; $i++ ) {
     45function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null ): int {
     46    $byte_length       = strlen( $bytes );
     47    $end               = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
     48    $invalid_length    = 0;
     49    $count             = 0;
     50    $max_count         = $max_code_points ?? PHP_INT_MAX;
     51
     52    for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
    2153        /*
    2254         * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
     
    2557         * depending on whether the JIT has optimized the function.
    2658         */
    27         $i += strspn(
     59        $ascii_byte_count = strspn(
    2860            $bytes,
    2961            "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
    3062            "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
    3163            " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
    32             $i
     64            $i,
     65            $end - $i
    3366        );
     67
     68        if ( $count + $ascii_byte_count >= $max_count ) {
     69            $at    = $i + ( $max_count - $count );
     70            $count = $max_count;
     71            return $count;
     72        }
     73
     74        $count += $ascii_byte_count;
     75        $i     += $ascii_byte_count;
     76
    3477        if ( $i >= $end ) {
    35             break;
     78            $at = $end;
     79            return $count;
    3680        }
    3781
     
    4185         *
    4286         * Therefore everything past here is checking those multibyte sequences.
     87         *
     88         * It may look like there’s a need to check against the max bytes here,
     89         * but since each match of a single character returns, this function will
     90         * bail already if crossing the max-bytes threshold. This function SHALL
     91         * NOT return in the middle of a multi-byte character, so if a character
     92         * falls on each side of the max bytes, the entire character will be scanned.
     93         *
    4394         * Because it’s possible that there are truncated characters, the use of
    4495         * the null-coalescing operator with "\xC0" is a convenience for skipping
     
    4798         * truncated, it will find 0xC0 and reject as invalid UTF-8.
    4899         *
    49          *  > [The following table] lists all of the byte sequences that are well-formed
     100         * > [The following table] lists all of the byte sequences that are well-formed
    50101         * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
    51102         * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
     
    67118         *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
    68119         *
    69          * Notice that all valid third and forth bytes are in the range 80..BF. This
    70          * validator takes advantage of that to only check the range of those bytes once.
    71          *
    72          * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
    73120         * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
    74121         */
    75122
     123        // Valid two-byte code points.
    76124        $b1 = ord( $bytes[ $i ] );
    77125        $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
    78126
    79         // Valid two-byte code points.
    80 
    81127        if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
     128            ++$count;
    82129            ++$i;
    83130            continue;
    84131        }
    85132
     133        // Valid three-byte code points.
    86134        $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
    87135
    88         // Valid three-byte code points.
    89 
    90136        if ( $b3 < 0x80 || $b3 > 0xBF ) {
    91             return false;
     137            goto invalid_utf8;
    92138        }
    93139
     
    98144            ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
    99145        ) {
     146            ++$count;
    100147            $i += 2;
    101148            continue;
    102149        }
    103150
     151        // Valid four-byte code points.
    104152        $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
    105153
    106         // Valid four-byte code points.
    107 
    108154        if ( $b4 < 0x80 || $b4 > 0xBF ) {
    109             return false;
     155            goto invalid_utf8;
    110156        }
    111157
     
    115161            ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
    116162        ) {
     163            ++$count;
    117164            $i += 3;
    118165            continue;
    119166        }
    120167
    121         // Any other sequence is invalid.
    122         return false;
     168        /**
     169         * When encountering invalid byte sequences, Unicode suggests finding the
     170         * maximal subpart of a text and replacing that subpart with a single
     171         * replacement character.
     172         *
     173         * > This practice is more secure because it does not result in the
     174         * > conversion consuming parts of valid sequences as though they were
     175         * > invalid. It also guarantees at least one replacement character will
     176         * > occur for each instance of an invalid sequence in the original text.
     177         * > Furthermore, this practice can be defined consistently for better
     178         * > interoperability between different implementations of conversion.
     179         *
     180         * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
     181         */
     182        invalid_utf8:
     183        $at             = $i;
     184        $invalid_length = 1;
     185
     186        // Single-byte and two-byte characters.
     187        if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) {
     188            return $count;
     189        }
     190
     191        $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
     192        $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
     193
     194        // Find the maximal subpart and skip past it.
     195        if ( 0xE0 === ( $b1 & 0xF0 ) ) {
     196            // Three-byte characters.
     197            $b2_valid = (
     198                ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
     199                ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
     200                ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
     201                ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
     202            );
     203
     204            $invalid_length = min( $end - $i, $b2_valid ? 2 : 1 );
     205            return $count;
     206        } elseif ( 0xF0 === ( $b1 & 0xF8 ) ) {
     207            // Four-byte characters.
     208            $b2_valid = (
     209                ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
     210                ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
     211                ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
     212            );
     213
     214            $b3_valid = $b3 >= 0x80 && $b3 <= 0xBF;
     215
     216            $invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 );
     217            return $count;
     218        }
     219
     220        return $count;
    123221    }
    124222
    125     // Reaching the end implies validating every byte.
    126     return true;
     223    $at = $i;
     224    return $count;
    127225}
     226
     227/**
     228 * Fallback mechanism for safely validating UTF-8 bytes.
     229 *
     230 * @see wp_is_valid_utf8()
     231 *
     232 * @since 6.9.0
     233 * @access private
     234 *
     235 * @param string $bytes String which might contain text encoded as UTF-8.
     236 * @return bool Whether the provided bytes can decode as valid UTF-8.
     237 */
     238function _wp_is_valid_utf8_fallback( string $bytes ): bool {
     239    $bytes_length = strlen( $bytes );
     240    if ( 0 === $bytes_length ) {
     241        return true;
     242    }
     243
     244    $next_byte_at   = 0;
     245    $invalid_length = 0;
     246
     247    _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
     248
     249    return $bytes_length === $next_byte_at && 0 === $invalid_length;
     250}
Note: See TracChangeset for help on using the changeset viewer.