Make WordPress Core

Changeset 60950


Ignore:
Timestamp:
10/16/2025 11:17:14 PM (5 months ago)
Author:
dmsnell
Message:

Charset: Conditionally polyfill utf8_encode() and utf8_decode().

The utf8_encode() and utf8_decode() functions were deprecated in PHP 8.2.0 and will be removed in PHP 9.0. When that happens, any existing code which calls them will trigger a crash.

This patch introduces polyfills for those functions when they aren’t already present. The polyfill functions maintain backwards compatibility, including a deprecation notice.

Any code calling either of these functions ought to be refactored to avoid using them; there are better options which don’t carry the issues these functions do, and any code calling them is likely calling them inappropriately.

Developed in https://github.com/WordPress/wordpress-develop/pull/10011
Discussed in https://core.trac.wordpress.org/ticket/55603
Discussed in https://core.trac.wordpress.org/ticket/63863

See #63863.

Location:
trunk
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/compat-utf8.php

    r60949 r60950  
    338338    return $count;
    339339}
     340
     341/**
     342 * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
     343 * with the deprecated function from the PHP standard library.
     344 *
     345 * @since 6.9.0
     346 * @access private
     347 *
     348 * @see \utf8_encode()
     349 *
     350 * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
     351 * @return string Text converted into UTF-8.
     352 */
     353function _wp_utf8_encode_fallback( $iso_8859_1_text ) {
     354    $iso_8859_1_text = (string) $iso_8859_1_text;
     355    $at              = 0;
     356    $was_at          = 0;
     357    $end             = strlen( $iso_8859_1_text );
     358    $utf8            = '';
     359
     360    while ( $at < $end ) {
     361        // US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
     362        $ascii_byte_count = strspn(
     363            $iso_8859_1_text,
     364            "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
     365            "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
     366            " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
     367            $at
     368        );
     369
     370        if ( $ascii_byte_count > 0 ) {
     371            $at += $ascii_byte_count;
     372            continue;
     373        }
     374
     375        // All other bytes transform into two-byte UTF-8 sequences.
     376        $code_point = ord( $iso_8859_1_text[ $at ] );
     377        $byte1      = chr( 0xC0 | ( $code_point >> 6 ) );
     378        $byte2      = chr( 0x80 | ( $code_point & 0x3F ) );
     379
     380        $utf8 .= substr( $iso_8859_1_text, $was_at, $at - $was_at );
     381        $utf8 .= "{$byte1}{$byte2}";
     382
     383        ++$at;
     384        $was_at = $at;
     385    }
     386
     387    if ( 0 === $was_at ) {
     388        return $iso_8859_1_text;
     389    }
     390
     391    $utf8 .= substr( $iso_8859_1_text, $was_at );
     392    return $utf8;
     393}
     394
     395/**
     396 * Converts a string from UTF-8 to ISO-8859-1, maintaining backwards compatibility
     397 * with the deprecated function from the PHP standard library.
     398 *
     399 * @since 6.9.0
     400 * @access private
     401 *
     402 * @see \utf8_decode()
     403 *
     404 * @param string $utf8_text Text treated as UTF-8 bytes.
     405 * @return string Text converted into ISO-8859-1.
     406 */
     407function _wp_utf8_decode_fallback( $utf8_text ) {
     408    $utf8_text       = (string) $utf8_text;
     409    $at              = 0;
     410    $was_at          = 0;
     411    $end             = strlen( $utf8_text );
     412    $iso_8859_1_text = '';
     413
     414    while ( $at < $end ) {
     415        // US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
     416        $ascii_byte_count = strspn(
     417            $utf8_text,
     418            "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
     419            "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
     420            " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
     421            $at
     422        );
     423
     424        if ( $ascii_byte_count > 0 ) {
     425            $at += $ascii_byte_count;
     426            continue;
     427        }
     428
     429        $next_at        = $at;
     430        $invalid_length = 0;
     431        $found          = _wp_scan_utf8( $utf8_text, $next_at, $invalid_length, null, 1 );
     432        $span_length    = $next_at - $at;
     433        $next_byte      = '?';
     434
     435        if ( 1 !== $found ) {
     436            if ( $invalid_length > 0 ) {
     437                $next_byte = '';
     438                goto flush_sub_part;
     439            }
     440
     441            break;
     442        }
     443
     444        // All convertible code points are two-bytes long.
     445        $byte1 = ord( $utf8_text[ $at ] );
     446        if ( 0xC0 !== ( $byte1 & 0xE0 ) ) {
     447            goto flush_sub_part;
     448        }
     449
     450        // All convertible code points are not greater than U+FF.
     451        $byte2 = ord( $utf8_text[ $at + 1 ] );
     452        $code_point = ( ( $byte1 & 0x1F ) << 6 ) | ( ( $byte2 & 0x3F ) );
     453        if ( $code_point > 0xFF ) {
     454            goto flush_sub_part;
     455        }
     456
     457        $next_byte = chr( $code_point );
     458
     459        flush_sub_part:
     460        $iso_8859_1_text .= substr( $utf8_text, $was_at, $at - $was_at );
     461        $iso_8859_1_text .= $next_byte;
     462        $at              += $span_length;
     463        $was_at           = $at;
     464
     465        if ( $invalid_length > 0 ) {
     466            $iso_8859_1_text .= '?';
     467            $at              += $invalid_length;
     468            $was_at           = $at;
     469        }
     470    }
     471
     472    if ( 0 === $was_at ) {
     473        return $utf8_text;
     474    }
     475
     476    $iso_8859_1_text .= substr( $utf8_text, $was_at );
     477    return $iso_8859_1_text;
     478}
  • trunk/src/wp-includes/compat.php

    r60949 r60950  
    248248}
    249249
     250if ( ! function_exists( 'utf8_encode' ) ) :
     251    if ( extension_loaded( 'mbstring' ) ) :
     252        /**
     253         * Converts a string from ISO-8859-1 to UTF-8.
     254         *
     255         * @deprecated Use {@see \mb_convert_encoding()} instead.
     256         *
     257         * @since 6.9.0
     258         *
     259         * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
     260         * @return string Text converted into a UTF-8.
     261         */
     262        function utf8_encode( $iso_8859_1_text ): string {
     263            _deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );
     264
     265            return mb_convert_encoding( $iso_8859_1_text, 'UTF-8', 'ISO-8859-1' );
     266        }
     267
     268    else :
     269        /**
     270         * @ignore
     271         * @private
     272         *
     273         * @since 6.9.0
     274         */
     275        function utf8_encode( $iso_8859_1_text ): string {
     276            _deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );
     277
     278            return _wp_utf8_encode_fallback( $iso_8859_1_text );
     279        }
     280
     281    endif;
     282endif;
     283
     284if ( ! function_exists( 'utf8_decode' ) ) :
     285    if ( extension_loaded( 'mbstring' ) ) :
     286        /**
     287         * Converts a string from UTF-8 to ISO-8859-1.
     288         *
     289         * @deprecated Use {@see \mb_convert_encoding()} instead.
     290         *
     291         * @since 6.9.0
     292         *
     293         * @param string $utf8_text Text treated as UTF-8.
     294         * @return string Text converted into ISO-8859-1.
     295         */
     296        function utf8_decode( $utf8_text ): string {
     297            _deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );
     298
     299            return mb_convert_encoding( $utf8_text, 'ISO-8859-1', 'UTF-8' );
     300        }
     301
     302    else :
     303        /**
     304         * @ignore
     305         * @private
     306         *
     307         * @since 6.9.0
     308         */
     309        function utf8_decode( $utf8_text ): string {
     310            _deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );
     311
     312            return _wp_utf8_decode_fallback( $utf8_text );
     313        }
     314
     315    endif;
     316endif;
     317
    250318// sodium_crypto_box() was introduced in PHP 7.2.
    251319if ( ! function_exists( 'sodium_crypto_box' ) ) {
Note: See TracChangeset for help on using the changeset viewer.