Make WordPress Core

Changeset 60793


Ignore:
Timestamp:
09/23/2025 03:34:20 AM (4 days ago)
Author:
dmsnell
Message:

Charset: Improve UTF-8 scrubbing ability via new UTF-8 scanning pipeline.

This is the fourth in a series of patches to modernize and standardize UTF-8 handling.

wp_check_invalid_utf8() has long been dependent on the runtime configuration of the system running it. This has led to hard-to-diagnose issues with text containing invalid UTF-8. The function has also had an apparent defect since its inception: when requesting to strip invalid bytes it returns an empty string.

This patch updates the function to remove all dependency on the system running it. It defers to the mbstring extension if that’s available, falling back to the new UTF-8 scanning pipeline.

To support this work, wp_scrub_utf8() is created with a proper fallback so that the remaining logic inside of wp_check_invalid_utf8() can be minimized. The defect in this function has been fixed, but instead of stripping the invalid bytes it will replace them with the Unicode replacement character for stronger security guarantees.

Developed in https://github.com/WordPress/wordpress-develop/pull/9498
Discussed in https://core.trac.wordpress.org/ticket/63837

Follow-up to: [60768].
Props askapache, chriscct7, Cyrille37, desrosj, dmsnell, helen, jonsurrell, kitchin, miqrogroove, pbearne, shailu25.
Fixes #63837, #29717.
See #63863.

Location:
trunk
Files:
2 added
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/compat-utf8.php

    r60768 r60793  
    228228 * Fallback mechanism for safely validating UTF-8 bytes.
    229229 *
    230  * @see wp_is_valid_utf8()
    231  *
    232230 * @since 6.9.0
    233231 * @access private
     232 *
     233 * @see wp_is_valid_utf8()
    234234 *
    235235 * @param string $bytes String which might contain text encoded as UTF-8.
     
    249249    return $bytes_length === $next_byte_at && 0 === $invalid_length;
    250250}
     251
     252/**
     253 * Fallback mechanism for replacing invalid spans of UTF-8 bytes.
     254 *
     255 * Example:
     256 *
     257 *     'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252.
     258 *
     259 * @since 6.9.0
     260 * @access private
     261 *
     262 * @see wp_scrub_utf8()
     263 *
     264 * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes.
     265 * @return string Input string with spans of invalid bytes swapped with the replacement character.
     266 */
     267function _wp_scrub_utf8_fallback( string $bytes ): string {
     268    $bytes_length   = strlen( $bytes );
     269    $next_byte_at   = 0;
     270    $was_at         = 0;
     271    $invalid_length = 0;
     272    $scrubbed       = '';
     273
     274    while ( $next_byte_at <= $bytes_length ) {
     275        _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
     276
     277        if ( $next_byte_at >= $bytes_length ) {
     278            if ( 0 === $was_at ) {
     279                return $bytes;
     280            }
     281
     282            return $scrubbed . substr( $bytes, $was_at, $next_byte_at - $was_at - $invalid_length );
     283        }
     284
     285        $scrubbed .= substr( $bytes, $was_at, $next_byte_at - $was_at );
     286        $scrubbed .= "\u{FFFD}";
     287
     288        $next_byte_at += $invalid_length;
     289        $was_at        = $next_byte_at;
     290    }
     291
     292    return $scrubbed;
     293}
  • trunk/src/wp-includes/formatting.php

    r60743 r60793  
    919919
    920920/**
    921  * Determines if a given byte string represents a valid UTF-8 encoding.
    922  *
    923  * Note that it’s unlikely for non-UTF-8 data to validate as UTF-8, but
    924  * it is still possible. Many texts are simultaneously valid UTF-8,
    925  * valid US-ASCII, and valid ISO-8859-1 (`latin1`).
    926  *
    927  * Example:
    928  *
    929  *     true === wp_is_valid_utf8( '' );
    930  *     true === wp_is_valid_utf8( 'just a test' );
    931  *     true === wp_is_valid_utf8( "\xE2\x9C\x8F" );    // Pencil, U+270F.
    932  *     true === wp_is_valid_utf8( "\u{270F}" );        // Pencil, U+270F.
    933  *     true === wp_is_valid_utf8( '✏' );              // Pencil, U+270F.
    934  *
    935  *     false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes.
    936  *     false === wp_is_valid_utf8( "\xE2\x9C" );       // Invalid/incomplete sequences.
    937  *     false === wp_is_valid_utf8( "\xC1\xBF" );       // Overlong sequences.
    938  *     false === wp_is_valid_utf8( "\xED\xB0\x80" );   // Surrogate halves.
    939  *     false === wp_is_valid_utf8( "B\xFCch" );        // ISO-8859-1 high-bytes.
    940  *                                                     // E.g. The “ü” in ISO-8859-1 is a single byte 0xFC,
    941  *                                                     // but in UTF-8 is the two-byte sequence 0xC3 0xBC.
    942  *
    943  * A “valid” string consists of “well-formed UTF-8 code unit sequence[s],” meaning
    944  * that the bytes conform to the UTF-8 encoding scheme, all characters use the minimal
    945  * byte sequence required by UTF-8, and that no sequence encodes a UTF-16 surrogate
    946  * code point or any character above the representable range.
    947  *
    948  * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G32860
    949  *
    950  * @see _wp_is_valid_utf8_fallback
    951  *
    952  * @since 6.9.0
    953  *
    954  * @param string $bytes String which might contain text encoded as UTF-8.
    955  * @return bool Whether the provided bytes can decode as valid UTF-8.
    956  */
    957 function wp_is_valid_utf8( string $bytes ): bool {
    958     /*
    959      * Since PHP 8.3.0 the UTF-8 validity is cached internally
    960      * on string objects, making this a direct property lookup.
    961      *
    962      * This is to be preferred exclusively once PHP 8.3.0 is
    963      * the minimum supported version, because even when the
    964      * status isn’t cached, it uses highly-optimized code to
    965      * validate the byte stream.
    966      */
    967     return function_exists( 'mb_check_encoding' )
    968         ? mb_check_encoding( $bytes, 'UTF-8' )
    969         : _wp_is_valid_utf8_fallback( $bytes );
    970 }
    971 
    972 /**
    973921 * Converts a number of special characters into their HTML entities.
    974922 *
     
    11421090 * Checks for invalid UTF8 in a string.
    11431091 *
     1092 * Note! This function only performs its work if the `blog_charset` is set
     1093 * to UTF-8. For all other values it returns the input text unchanged.
     1094 *
     1095 * Note! Unless requested, this returns an empty string if the input contains
     1096 * any sequences of invalid UTF-8. To replace invalid byte sequences, pass
     1097 * `true` as the optional `$strip` parameter.
     1098 *
     1099 * Consider using {@see wp_scrub_utf8()} instead which does not depend on
     1100 * the value of `blog_charset`.
     1101 *
     1102 * Example:
     1103 *
     1104 *     // The `blog_charset` is `latin1`, so this returns the input unchanged.
     1105 *     $every_possible_input === wp_check_invalid_utf8( $every_possible_input );
     1106 *
     1107 *     // Valid strings come through unchanged.
     1108 *     'test' === wp_check_invalid_utf8( 'test' );
     1109 *
     1110 *     $invalid = "the byte \xC0 is never allowed in a UTF-8 string.";
     1111 *
     1112 *     // Invalid strings are rejected outright.
     1113 *     '' === wp_check_invalid_utf8( $invalid );
     1114 *
     1115 *     // “Stripping” invalid sequences produces the replacement character instead.
     1116 *     "the byte \u{FFFD} is never allowed in a UTF-8 string." === wp_check_invalid_utf8( $invalid, true );
     1117 *     'the byte � is never allowed in a UTF-8 string.' === wp_check_invalid_utf8( $invalid, true );
     1118 *
    11441119 * @since 2.8.0
    1145  *
    1146  * @param string $text   The text which is to be checked.
    1147  * @param bool   $strip  Optional. Whether to attempt to strip out invalid UTF8. Default false.
     1120 * @since 6.9.0 Stripping replaces invalid byte sequences with the Unicode replacement character U+FFFD (�).
     1121 *
     1122 * @param string $text   String which is expected to be encoded as UTF-8 unless `blog_charset` is another encoding.
     1123 * @param bool   $strip  Optional. Whether to replace invalid sequences of bytes with the Unicode replacement
     1124 *                       character (U+FFFD `�`). Default `false` returns an empty string for invalid UTF-8 inputs.
    11481125 * @return string The checked text.
    11491126 */
     
    11601137        $is_utf8 = is_utf8_charset();
    11611138    }
    1162     if ( ! $is_utf8 ) {
     1139
     1140    if ( ! $is_utf8 || wp_is_valid_utf8( $text ) ) {
    11631141        return $text;
    11641142    }
    11651143
    1166     // Check for support for utf8 in the installed PCRE library once and store the result in a static.
    1167     static $utf8_pcre = null;
    1168     if ( ! isset( $utf8_pcre ) ) {
    1169         // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
    1170         $utf8_pcre = @preg_match( '/^./u', 'a' );
    1171     }
    1172     // We can't demand utf8 in the PCRE installation, so just return the string in those cases.
    1173     if ( ! $utf8_pcre ) {
    1174         return $text;
    1175     }
    1176 
    1177     // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- preg_match fails when it encounters invalid UTF8 in $text.
    1178     if ( 1 === @preg_match( '/^./us', $text ) ) {
    1179         return $text;
    1180     }
    1181 
    1182     // Attempt to strip the bad chars if requested (not recommended).
    1183     if ( $strip && function_exists( 'iconv' ) ) {
    1184         return iconv( 'utf-8', 'utf-8', $text );
    1185     }
    1186 
    1187     return '';
     1144    return $strip
     1145        ? wp_scrub_utf8( $text )
     1146        : '';
    11881147}
    11891148
  • trunk/src/wp-settings.php

    r60743 r60793  
    112112require ABSPATH . WPINC . '/class-wp-list-util.php';
    113113require ABSPATH . WPINC . '/class-wp-token-map.php';
     114require ABSPATH . WPINC . '/utf8.php';
    114115require ABSPATH . WPINC . '/formatting.php';
    115116require ABSPATH . WPINC . '/meta.php';
Note: See TracChangeset for help on using the changeset viewer.