Context Navigation

← Previous Changeset
Next Changeset →

Changeset 60793

Timestamp:

09/23/2025 03:34:20 AM (4 weeks ago)

Author:

dmsnell

Message:

Charset: Improve UTF-8 scrubbing ability via new UTF-8 scanning pipeline.

This is the fourth in a series of patches to modernize and standardize UTF-8 handling.

wp_check_invalid_utf8() has long been dependent on the runtime configuration of the system running it. This has led to hard-to-diagnose issues with text containing invalid UTF-8. The function has also had an apparent defect since its inception: when requesting to strip invalid bytes it returns an empty string.

This patch updates the function to remove all dependency on the system running it. It defers to the mbstring extension if that’s available, falling back to the new UTF-8 scanning pipeline.

To support this work, wp_scrub_utf8() is created with a proper fallback so that the remaining logic inside of wp_check_invalid_utf8() can be minimized. The defect in this function has been fixed, but instead of stripping the invalid bytes it will replace them with the Unicode replacement character for stronger security guarantees.

Developed in https://github.com/WordPress/wordpress-develop/pull/9498
Discussed in https://core.trac.wordpress.org/ticket/63837

Follow-up to: [60768].
Props askapache, chriscct7, Cyrille37, desrosj, dmsnell, helen, jonsurrell, kitchin, miqrogroove, pbearne, shailu25.
Fixes #63837, #29717.
See #63863.

Location:

trunk

Files:

: 2 added
: 3 edited

src/wp-includes/compat-utf8.php (modified) (2 diffs)
src/wp-includes/formatting.php (modified) (3 diffs)
src/wp-includes/utf8.php (added)
src/wp-settings.php (modified) (1 diff)
tests/phpunit/tests/unicode/wpScrubUtf8.php (added)

Legend:

: Unmodified
: Added
: Removed

trunk/src/wp-includes/compat-utf8.php

-                      r60768
+                      r60793
  * Fallback mechanism for safely validating UTF-8 bytes.
+ *
- * @see wp_is_valid_utf8()
+ *
  * @since 6.9.0
  * @access private
+ *
+ * @see wp_is_valid_utf8()
+ *
  * @param string $bytes String which might contain text encoded as UTF-8.
 …
     return $bytes_length === $next_byte_at && 0 === $invalid_length;
+}
+/**
+ * Fallback mechanism for replacing invalid spans of UTF-8 bytes.
+ *
+ * The input is walked with repeated calls to `_wp_scan_utf8()`, passing the
+ * byte string, the current offset, and an invalid-length variable. After each
+ * call this function reads the offset and the invalid length back, so the
+ * helper presumably updates both by reference: the offset to where the next
+ * invalid span begins (or to the end of the input) and the length to the size
+ * of that span. NOTE(review): confirm against `_wp_scan_utf8()`.
+ *
+ * For every scan that stops before the end of the input, the bytes between
+ * the previous position and the new offset are copied to the output, a single
+ * U+FFFD replacement character is appended, and the reported invalid bytes
+ * are skipped. When a scan reaches the end of the input, the remaining bytes
+ * (less any reported invalid length) are appended and the result returned.
+ * If that happens while the previous position is still zero, i.e. before
+ * anything has been skipped, the original string is returned as-is and no
+ * copy is assembled.
+ *
+ * Example:
+ *
+ *     'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see wp_scrub_utf8()
+ *
+ * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes.
+ * @return string Input string with spans of invalid bytes swapped with the replacement character.
+ */
+function _wp_scrub_utf8_fallback( string $bytes ): string {
+    $bytes_length   = strlen( $bytes );
+    $next_byte_at   = 0;
+    $was_at         = 0;
+    $invalid_length = 0;
+    $scrubbed       = '';
+    while ( $next_byte_at <= $bytes_length ) {
+        _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
+        if ( $next_byte_at >= $bytes_length ) {
+            if ( 0 === $was_at ) {
+                return $bytes;
+            }
+            return $scrubbed . substr( $bytes, $was_at, $next_byte_at - $was_at - $invalid_length );
+        }
+        $scrubbed .= substr( $bytes, $was_at, $next_byte_at - $was_at );
+        $scrubbed .= "\u{FFFD}";
+        $next_byte_at += $invalid_length;
+        $was_at        = $next_byte_at;
+    }
+    return $scrubbed;
+}

trunk/src/wp-includes/formatting.php

-                      r60743
+                      r60793
 /**
- * Determines if a given byte string represents a valid UTF-8 encoding.
+ *
- * Note that it’s unlikely for non-UTF-8 data to validate as UTF-8, but
- * it is still possible. Many texts are simultaneously valid UTF-8,
- * valid US-ASCII, and valid ISO-8859-1 (`latin1`).
+ *
- * Example:
+ *
- *     true === wp_is_valid_utf8( '' );
- *     true === wp_is_valid_utf8( 'just a test' );
- *     true === wp_is_valid_utf8( "\xE2\x9C\x8F" );    // Pencil, U+270F.
- *     true === wp_is_valid_utf8( "\u{270F}" );        // Pencil, U+270F.
- *     true === wp_is_valid_utf8( '✏' );              // Pencil, U+270F.
+ *
- *     false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes.
- *     false === wp_is_valid_utf8( "\xE2\x9C" );       // Invalid/incomplete sequences.
- *     false === wp_is_valid_utf8( "\xC1\xBF" );       // Overlong sequences.
- *     false === wp_is_valid_utf8( "\xED\xB0\x80" );   // Surrogate halves.
- *     false === wp_is_valid_utf8( "B\xFCch" );        // ISO-8859-1 high-bytes.
- *                                                     // E.g. The “ü” in ISO-8859-1 is a single byte 0xFC,
- *                                                     // but in UTF-8 is the two-byte sequence 0xC3 0xBC.
+ *
- * A “valid” string consists of “well-formed UTF-8 code unit sequence[s],” meaning
- * that the bytes conform to the UTF-8 encoding scheme, all characters use the minimal
- * byte sequence required by UTF-8, and that no sequence encodes a UTF-16 surrogate
- * code point or any character above the representable range.
+ *
- * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G32860
+ *
- * @see _wp_is_valid_utf8_fallback
+ *
- * @since 6.9.0
+ *
- * @param string $bytes String which might contain text encoded as UTF-8.
- * @return bool Whether the provided bytes can decode as valid UTF-8.
- */
-function wp_is_valid_utf8( string $bytes ): bool {
-    /*
-     * Since PHP 8.3.0 the UTF-8 validity is cached internally
-     * on string objects, making this a direct property lookup.
+     *
-     * This is to be preferred exclusively once PHP 8.3.0 is
-     * the minimum supported version, because even when the
-     * status isn’t cached, it uses highly-optimized code to
-     * validate the byte stream.
-     */
-    return function_exists( 'mb_check_encoding' )
-        ? mb_check_encoding( $bytes, 'UTF-8' )
-        : _wp_is_valid_utf8_fallback( $bytes );
+}
-/**
  * Converts a number of special characters into their HTML entities.
+ *
 …
  * Checks for invalid UTF8 in a string.
+ *
+ * Note! This function only performs its work if the `blog_charset` is set
+ * to UTF-8. For all other values it returns the input text unchanged.
+ *
+ * Note! Unless requested, this returns an empty string if the input contains
+ * any sequences of invalid UTF-8. To replace invalid byte sequences, pass
+ * `true` as the optional `$strip` parameter.
+ *
+ * Consider using {@see wp_scrub_utf8()} instead which does not depend on
+ * the value of `blog_charset`.
+ *
+ * Example:
+ *
+ *     // The `blog_charset` is `latin1`, so this returns the input unchanged.
+ *     $every_possible_input === wp_check_invalid_utf8( $every_possible_input );
+ *
+ *     // Valid strings come through unchanged.
+ *     'test' === wp_check_invalid_utf8( 'test' );
+ *
+ *     $invalid = "the byte \xC0 is never allowed in a UTF-8 string.";
+ *
+ *     // Invalid strings are rejected outright.
+ *     '' === wp_check_invalid_utf8( $invalid );
+ *
+ *     // “Stripping” invalid sequences produces the replacement character instead.
+ *     "the byte \u{FFFD} is never allowed in a UTF-8 string." === wp_check_invalid_utf8( $invalid, true );
+ *     'the byte � is never allowed in a UTF-8 string.' === wp_check_invalid_utf8( $invalid, true );
+ *
  * @since 2.8.0
+ *
+ * @param string $text   The text which is to be checked.
+ * @param bool   $strip  Optional. Whether to attempt to strip out invalid UTF8. Default false.
+ * @since 6.9.0 Stripping replaces invalid byte sequences with the Unicode replacement character U+FFFD (�).
+ *
+ * @param string $text   String which is expected to be encoded as UTF-8 unless `blog_charset` is another encoding.
+ * @param bool   $strip  Optional. Whether to replace invalid sequences of bytes with the Unicode replacement
+ *                       character (U+FFFD `�`). Default `false` returns an empty string for invalid UTF-8 inputs.
  * @return string The checked text.
  */
 …
         $is_utf8 = is_utf8_charset();
+    }
+    if ( ! $is_utf8 ) {
+    if ( ! $is_utf8 || wp_is_valid_utf8( $text ) ) {
         return $text;
+    }
+    // Check for support for utf8 in the installed PCRE library once and store the result in a static.
+    static $utf8_pcre = null;
+    if ( ! isset( $utf8_pcre ) ) {
+        // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
+        $utf8_pcre = @preg_match( '/^./u', 'a' );
+    }
+    // We can't demand utf8 in the PCRE installation, so just return the string in those cases.
+    if ( ! $utf8_pcre ) {
+        return $text;
+    }
+    // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- preg_match fails when it encounters invalid UTF8 in $text.
+    if ( 1 === @preg_match( '/^./us', $text ) ) {
+        return $text;
+    }
+    // Attempt to strip the bad chars if requested (not recommended).
+    if ( $strip && function_exists( 'iconv' ) ) {
+        return iconv( 'utf-8', 'utf-8', $text );
+    }
+    return '';
+    return $strip
+        ? wp_scrub_utf8( $text )
+        : '';
+}

trunk/src/wp-settings.php

r60743	r60793
112	112	require ABSPATH . WPINC . '/class-wp-list-util.php';
113	113	require ABSPATH . WPINC . '/class-wp-token-map.php';
	114	require ABSPATH . WPINC . '/utf8.php';
114	115	require ABSPATH . WPINC . '/formatting.php';
115	116	require ABSPATH . WPINC . '/meta.php';

Note: See TracChangeset for help on using the changeset viewer.

Trac UI Preferences

Make WordPress Core