id summary reporter owner description type status priority milestone component version severity resolution keywords cc focuses 34631 Extra compat for mbstring: mb_strpos() Cybr "Hello, I noticed that compat.php is missing a compat function for mb_strpos(). Using this function within a plugin results in a fatal error if the server doesn't support mbstring. So I wrote a fallback that takes over when the native function does not exist. I also implemented warnings modeled on the PHP 5.5 source: https://github.com/php/php-src/blob/PHP-5.5/ext/standard/string.c#L1824 {{{#!php if ( ! function_exists( 'mb_strpos' ) ) { function mb_strpos( $haystack, $needle, $offset = 0, $encoding = null ) { return _mb_strpos( $haystack, $needle, $offset, $encoding ); } } /* * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit. * For $encoding === UTF-8, the $haystack and $needle inputs are expected to be valid UTF-8 byte sequences. * The behavior of this function for invalid inputs is PHP compliant. */ if ( ! function_exists( '_mb_strpos' ) ) { function _mb_strpos( $haystack, $needle, $offset = 0, $encoding = null ) { if ( null === $encoding ) { $encoding = get_option( 'blog_charset' ); } // The solution below works only for UTF-8, // so in case of a different charset just use built-in strpos() if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { return $offset === 0 ? 
strpos( $haystack, $needle ) : strpos( $haystack, $needle, $offset ); } $haystack_len = mb_strlen( $haystack ); if ( $offset < (int) 0 || $offset > $haystack_len ) { trigger_error( 'mb_strpos(): Offset not contained in string', E_USER_WARNING ); return false; } if ( !is_string( $needle ) ) { $needle = (string) $needle; if ( !is_string( $needle ) ) { trigger_error( 'mb_strpos(): Array to string conversion', E_USER_WARNING ); return false; } } if ( empty( $needle ) ) { trigger_error( 'mb_strpos(): Empty needle', E_USER_WARNING ); return false; } // Slice off the offset $haystack_sub = mb_substr( $haystack, $offset ); if ( _wp_can_use_pcre_u() ) { // Use the regex unicode support to separate the UTF-8 characters into an array preg_match_all( ""/./us"", $haystack, $match_h ); preg_match_all( ""/$needle/us"", $haystack_sub, $match_n ); $pos = key( array_intersect( $match_h[0], $match_n[0] ) ); if ( empty( $pos ) ) { return false; } return (int) $pos; } $regex = '/( [\x00-\x7F] # single-byte sequences 0xxxxxxx | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 | [\xE1-\xEC][\x80-\xBF]{2} | \xED[\x80-\x9F][\x80-\xBF] | [\xEE-\xEF][\x80-\xBF]{2} | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 | [\xF1-\xF3][\x80-\xBF]{3} | \xF4[\x80-\x8F][\x80-\xBF]{2} )/x'; /** * Place haystack into array */ $match_h = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop do { // We had some string left over from the last round, but we counted it in that last round. 
array_pop( $match_h ); // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) $pieces = preg_split( $regex, $haystack, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); $match_h = array_merge( $match_h, $pieces ); } while ( count( $pieces ) > 1 && $haystack = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. /** * Place haystack offset into array */ $match_hs = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop do { // We had some string left over from the last round, but we counted it in that last round. array_pop( $match_hs ); // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) $pieces = preg_split( $regex, $haystack_sub, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); $match_hs = array_merge( $match_hs, $pieces ); } while ( count( $pieces ) > 1 && $haystack_sub = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. /** * Put needle into array */ $match_n = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop do { // We had some string left over from the last round, but we counted it in that last round. array_pop( $match_n ); // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) $pieces = preg_split( $regex, $needle, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); $match_n = array_merge( $match_n, $pieces ); } while ( count( $pieces ) > 1 && $needle = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. /** * Compute match of haystack offset with needle * If passed, find the array key number within the full haystack. */ $pos = in_array( $match_n[0], $match_hs ) ? key( array_intersect( $match_h, $match_n ) ) : ''; if ( empty( $pos ) ) { return false; } return (int) $pos; } } }}} `if ( ! 
function_exists( '_mb_strpos' ) ) {` could probably be removed, since `_mb_strpos()` would itself be a core function. To test this, I used the following lines of code: {{{#!php var_dump( _mb_strpos( '象形指事', '指', 0 ) ); // 2 var_dump( _mb_strpos( '象形指事', '指', 1 ) ); // 2 var_dump( _mb_strpos( '象形指事', '指', 2 ) ); // 2 var_dump( _mb_strpos( '象形指事', '指', 3 ) ); // false var_dump( _mb_strpos( '象形指事', '指', -1 ) ); // false WARNING var_dump( _mb_strpos( '象形指事', '指', 4 ) ); // false var_dump( _mb_strpos( '象形指事', '指', 5 ) ); // false WARNING echo PHP_EOL.PHP_EOL; var_dump( mb_strpos( '象形指事', '指', 0 ) ); // 2 var_dump( mb_strpos( '象形指事', '指', 1 ) ); // 2 var_dump( mb_strpos( '象形指事', '指', 2 ) ); // 2 var_dump( mb_strpos( '象形指事', '指', 3 ) ); // false var_dump( mb_strpos( '象形指事', '指', -1 ) ); // false WARNING var_dump( mb_strpos( '象形指事', '指', 4 ) ); // false var_dump( mb_strpos( '象形指事', '指', 5 ) ); // false WARNING }}} Feel free to contribute your thoughts :) Thanks!" enhancement new normal Charset 4.4 normal has-patch needs-testing