Make WordPress Core


Ignore:
Timestamp:
05/06/2015 02:59:50 AM (10 years ago)
Author:
pento
Message:

WPDB: When checking that a string can be sent to MySQL, we shouldn't use mb_convert_encoding(), as it behaves differently to MySQL's character encoding conversion.

Props mdawaffe, pento, nbachiyski, jorbin, johnjamesjacoby, jeremyfelt.

See #32165.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/compat.php

    r32115 r32364  
    1414}
    1515
     16/**
     17 * Returns whether PCRE/u (PCRE_UTF8 modifier) is available for use.
     18 *
     19 * @ignore
     20 * @since 4.2.2
     21 * @access private
     22 *
     23 * @param bool $set - Used for testing only
     24 *             null   : default - get PCRE/u capability
     25 *             false  : Used for testing - return false for future calls to this function
     26 *             'reset': Used for testing - restore default behavior of this function
     27 */
     28function _wp_can_use_pcre_u( $set = null ) {
     29    static $utf8_pcre = 'reset';
     30
     31    if ( null !== $set ) {
     32        $utf8_pcre = $set;
     33    }
     34
     35    if ( 'reset' === $utf8_pcre ) {
     36        $utf8_pcre = @preg_match( '/^./u', 'a' );
     37    }
     38
     39    return $utf8_pcre;
     40}
     41
    1642if ( ! function_exists( 'mb_substr' ) ) :
    1743    function mb_substr( $str, $start, $length = null, $encoding = null ) {
     
    2046endif;
    2147
     48/*
     49 * Only understands UTF-8 and 8bit.  All other character sets will be treated as 8bit.
     50 * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence.
     51 * The behavior of this function for invalid inputs is undefined.
     52 */
    2253function _mb_substr( $str, $start, $length = null, $encoding = null ) {
     54    if ( null === $encoding ) {
     55        $encoding = get_option( 'blog_charset' );
     56    }
     57
    2358    // The solution below works only for UTF-8,
    2459    // so in case of a different charset just use built-in substr()
    25     $charset = get_option( 'blog_charset' );
    26     if ( ! in_array( $charset, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) {
     60    if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) {
    2761        return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
    2862    }
    29     // Use the regex unicode support to separate the UTF-8 characters into an array
    30     preg_match_all( '/./us', $str, $match );
    31     $chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length );
    32     return implode( '', $chars );
     63
     64    if ( _wp_can_use_pcre_u() ) {
     65        // Use the regex unicode support to separate the UTF-8 characters into an array
     66        preg_match_all( '/./us', $str, $match );
     67        $chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length );
     68        return implode( '', $chars );
     69    }
     70
     71    $regex = '/(
     72          [\x00-\x7F]                  # single-byte sequences   0xxxxxxx
     73        | [\xC2-\xDF][\x80-\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
     74        | \xE0[\xA0-\xBF][\x80-\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
     75        | [\xE1-\xEC][\x80-\xBF]{2}
     76        | \xED[\x80-\x9F][\x80-\xBF]
     77        | [\xEE-\xEF][\x80-\xBF]{2}
     78        | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
     79        | [\xF1-\xF3][\x80-\xBF]{3}
     80        | \xF4[\x80-\x8F][\x80-\xBF]{2}
     81    )/x';
     82
     83    $chars = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop
     84    do {
     85        // We had some string left over from the last round, but we counted it in that last round.
     86        array_pop( $chars );
     87
     88        // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string)
     89        $pieces = preg_split( $regex, $str, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
     90
     91        $chars = array_merge( $chars, $pieces );
     92    } while ( count( $pieces ) > 1 && $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop.
     93
     94    return join( '', array_slice( $chars, $start, $length ) );
    3395}
    3496
     
    39101endif;
    40102
     103/*
     104 * Only understands UTF-8 and 8bit.  All other character sets will be treated as 8bit.
     105 * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence.
     106 * The behavior of this function for invalid inputs is undefined.
     107 */
    41108function _mb_strlen( $str, $encoding = null ) {
     109    if ( null === $encoding ) {
     110        $encoding = get_option( 'blog_charset' );
     111    }
     112
    42113    // The solution below works only for UTF-8,
    43114    // so in case of a different charset just use built-in strlen()
    44     $charset = get_option( 'blog_charset' );
    45     if ( ! in_array( $charset, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) {
     115    if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) {
    46116        return strlen( $str );
    47117    }
    48     // Use the regex unicode support to separate the UTF-8 characters into an array
    49     preg_match_all( '/./us', $str, $match );
    50     return count( $match[0] );
     118
     119    if ( _wp_can_use_pcre_u() ) {
     120        // Use the regex unicode support to separate the UTF-8 characters into an array
     121        preg_match_all( '/./us', $str, $match );
     122        return count( $match[0] );
     123    }
     124
     125    $regex = '/(?:
     126          [\x00-\x7F]                  # single-byte sequences   0xxxxxxx
     127        | [\xC2-\xDF][\x80-\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
     128        | \xE0[\xA0-\xBF][\x80-\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
     129        | [\xE1-\xEC][\x80-\xBF]{2}
     130        | \xED[\x80-\x9F][\x80-\xBF]
     131        | [\xEE-\xEF][\x80-\xBF]{2}
     132        | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
     133        | [\xF1-\xF3][\x80-\xBF]{3}
     134        | \xF4[\x80-\x8F][\x80-\xBF]{2}
     135    )/x';
     136
     137    $count = 1; // Start at 1 instead of 0 since the first thing we do is decrement
     138    do {
     139        // We had some string left over from the last round, but we counted it in that last round.
     140        $count--;
     141
     142        // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string)
     143        $pieces = preg_split( $regex, $str, 1000 );
     144
     145        // Increment
     146        $count += count( $pieces );
     147    } while ( $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop.
     148
     149    // Fencepost: preg_split() always returns one extra item in the array
     150    return --$count;
    51151}
    52152
Note: See TracChangeset for help on using the changeset viewer.