Make WordPress Core

Ticket #29717: 29717.5.patch

File 29717.5.patch, 4.3 KB (added by askapache, 10 years ago)

29717.5.patch - mb_convert_encoding preferred over iconv

  • formatting.php

     
    703703/**
    704704 * Checks for invalid UTF8 in a string.
    705705 *
     706 * Could change the ini_setting mbstring.substitute_character to 'none' without restoring.
     707 *
    706708 * @since 2.8.0
    707709 *
    708710 * @param string $string The text which is to be checked.
    709711 * @param boolean $strip Optional. Whether to attempt to strip out invalid UTF8. Default is false.
    710  * @return string The checked text.
     712 * @return string If the string is valid UTF-8 or the blog_charset is not UTF-8, the string is returned unmodified. Otherwise, an empty string is returned, or optionally the string stripped of invalid chars.
    711713 */
    712714function wp_check_invalid_utf8( $string, $strip = false ) {
    713715        $string = (string) $string;
    714716
    715         if ( 0 === strlen( $string ) ) {
     717        // Return an empty string if the string is empty (the isset() check is used as a faster alternative to strlen())
     718        if ( ! isset( $string[0] ) ) {
    716719                return '';
    717720        }
    718721
    719         // Store the site charset as a static to avoid multiple calls to get_option()
    720         static $is_utf8;
    721         if ( !isset( $is_utf8 ) ) {
    722                 $is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) );
     722        // Store the site charset and whether pcre_utf8 is enabled
     723        static $is_utf8, $pcre_utf8;
     724
     725        // On the first call to this function, store in a static whether the blog charset is UTF-8
     726        if ( ! isset( $is_utf8 ) ) {
     727                $is_utf8 = get_option( 'blog_charset' );
     728                $is_utf8 = ( 'UTF-8' == $is_utf8 || 'UTF8' == $is_utf8 || 'utf-8' == $is_utf8 || 'utf8' == $is_utf8 );
     729
     730                // Check once whether the installed PCRE library supports the UTF-8 /u modifier
     731                $pcre_utf8 = ( @preg_match( '//u', '' ) !== false );
    723732        }
    724         if ( !$is_utf8 ) {
    725                 return $string;
    726         }
    727733
    728         // Check for support for utf8 in the installed PCRE library once and store the result in a static
    729         static $utf8_pcre;
    730         if ( !isset( $utf8_pcre ) ) {
    731                 $utf8_pcre = @preg_match( '/^./u', 'a' );
    732         }
    733         // We can't demand utf8 in the PCRE installation, so just return the string in those cases
    734         if ( !$utf8_pcre ) {
     734        // if utf not used return the string unmodified
     735        if ( ! $is_utf8 ) {
    735736                return $string;
    736737        }
    737738
    738         // preg_match fails when it encounters invalid UTF8 in $string
    739         if ( 1 === @preg_match( '/^./us', $string ) ) {
    740                 return $string;
     739        // If pcre_utf support is available and string is valid, return string
     740        if ( $pcre_utf8 ) {
     741                if ( preg_match( '//u', $string ) !== false ) {
     742                        return $string;
     743                }
     744        } else {
     745                // If pattern option is available then test the string and return if valid. PCRE added (*UTF8) in Version 7.9 11-Apr-09
     746                if ( @preg_match( '/(*UTF8)/', '' ) !== false && preg_match( '/(*UTF8)/', $string ) !== false ) {
     747                        return $string;
     748                }
     749
     750                // If PCRE has no UTF-8 support, fall back to htmlspecialchars(): an empty result means invalid UTF-8, otherwise return the string as valid
     751                if ( htmlspecialchars( $string, null, 'utf-8' ) != '' ) {
     752                        return $string;
     753                }
    741754        }
    742755
    743         // Attempt to strip the bad chars if requested (not recommended)
    744         if ( $strip && function_exists( 'iconv' ) ) {
    745                 return iconv( 'utf-8', 'utf-8', $string );
     756        // Attempt to strip the bad chars if requested
     757        if ( $strip ) {
     758                // whether mb_convert_encoding should be used (preferred over iconv)
     759                static $mb_convert;
     760
     761                if ( ! isset( $mb_convert ) ) {
     762                        // True if the mbstring extension is present and setting the substitute character to 'none' succeeds
     763                        $mb_convert = ( function_exists( 'mb_substitute_character' ) && mb_substitute_character( 'none' ) === true );
     764
     765                        if ( ! $mb_convert ) {
     766                                // Whether iconv is available
     767                                static $iconv;
     768
     769                                // This extension is enabled by default, although it may be disabled by compiling using --without-iconv, or may be the wrong iconv lib
     770                                $iconv = ( function_exists( 'iconv' ) && defined( ICONV_IMPL ) && ICONV_IMPL == 'libiconv' );
     771                        }
     772                }
     773
     774                // Use mb_convert_encoding, return string minus invalid utf
     775                if ( $mb_convert ) {
     776                        return mb_convert_encoding( $string, 'UTF-8', 'UTF-8' );
     777                } elseif ( $iconv ) {
     778                        // Characters that cannot be represented in the target charset are silently discarded.  Needs '@' see _php_iconv_show_error
     779                        return @iconv( 'UTF-8', 'UTF-8//IGNORE', $string );
     780                }
    746781        }
    747782
     783        // default to returning empty string, meaning invalid utf was found
    748784        return '';
    749785}
    750786