Opened 8 years ago
Last modified 4 years ago
#34631 new enhancement
Extra compat for mbstring: mb_strpos()
Reported by: |
|
Owned by: | |
---|---|---|---|
Milestone: | Priority: | normal | |
Severity: | normal | Version: | 4.4 |
Component: | Charset | Keywords: | has-patch needs-testing |
Focuses: | Cc: |
Description
Hello,
I noticed that compat.php is missing a compat function for mb_strpos().
The use of this function within a plugin will result in a fatal error if the server doesn't support mbstring.
So I wrote a fallback function that takes over when the native mb_strpos() does not exist.
I also implemented warnings for invalid arguments, modelled on the PHP 5.5 source: https://github.com/php/php-src/blob/PHP-5.5/ext/standard/string.c#L1824
/*
 * Compat shim: declare mb_strpos() only when no function of that name exists yet.
 * All the work is delegated to _mb_strpos(); the arguments are passed through unchanged.
 */
if ( ! function_exists( 'mb_strpos' ) ) :
	function mb_strpos( $haystack, $needle, $offset = 0, $encoding = null ) {
		$position = _mb_strpos( $haystack, $needle, $offset, $encoding );

		return $position;
	}
endif;
/**
 * Internal compat implementation of mb_strpos().
 *
 * Only understands UTF-8 and 8bit. All other character sets are treated as 8bit:
 * the call is handed straight to strpos(), so offsets and results are byte positions.
 * For UTF-8, $haystack and $needle are expected to be valid UTF-8 byte sequences;
 * $offset and the returned position are then counted in characters.
 *
 * An E_USER_WARNING is raised (and false returned) when, in the UTF-8 path,
 * $offset is negative or lies beyond the end of $haystack, when $needle is an
 * array, or when $needle is the empty string.
 *
 * @param string      $haystack The string to search in.
 * @param string      $needle   The string to search for. May be longer than one character.
 * @param int         $offset   Optional. Position at which to start searching. Default 0.
 * @param string|null $encoding Optional. Character encoding. Default null, which reads
 *                              the 'blog_charset' option.
 * @return int|false Position of the first occurrence of $needle at or after $offset,
 *                   or false when it is not found or the arguments are invalid.
 */
if ( ! function_exists( '_mb_strpos' ) ) {
	function _mb_strpos( $haystack, $needle, $offset = 0, $encoding = null ) {
		if ( null === $encoding ) {
			$encoding = get_option( 'blog_charset' );
		}

		// The solution below works only for UTF-8,
		// so in case of a different charset just use built-in strpos().
		if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) {
			return $offset === 0 ? strpos( $haystack, $needle ) : strpos( $haystack, $needle, $offset );
		}

		$haystack = (string) $haystack;
		$offset   = (int) $offset;
		$byte_len = strlen( $haystack );

		/*
		 * Translate the character offset into a byte offset by stepping over
		 * $offset characters. A character is one lead byte followed by any
		 * number of continuation bytes (bit pattern 10xxxxxx).
		 */
		$byte_offset = 0;
		$chars_seen  = 0;
		while ( $chars_seen < $offset && $byte_offset < $byte_len ) {
			$byte_offset++;
			while ( $byte_offset < $byte_len && 0x80 === ( ord( $haystack[ $byte_offset ] ) & 0xC0 ) ) {
				$byte_offset++;
			}
			$chars_seen++;
		}

		// Running out of characters before reaching $offset means it is past the end.
		if ( $offset < 0 || $chars_seen < $offset ) {
			trigger_error( 'mb_strpos(): Offset not contained in string', E_USER_WARNING );
			return false;
		}

		// Reject arrays before casting; a cast would silently turn them into "Array".
		if ( is_array( $needle ) ) {
			trigger_error( 'mb_strpos(): Array to string conversion', E_USER_WARNING );
			return false;
		}

		$needle = (string) $needle;

		// Compare against '' rather than using empty(), so that "0" stays a valid needle.
		if ( '' === $needle ) {
			trigger_error( 'mb_strpos(): Empty needle', E_USER_WARNING );
			return false;
		}

		/*
		 * A byte-level search is safe here: in valid UTF-8 a complete character
		 * sequence can only match at a character boundary. It also avoids treating
		 * the needle as a regular expression.
		 */
		$byte_pos = strpos( $haystack, $needle, $byte_offset );
		if ( false === $byte_pos ) {
			return false;
		}

		// Convert the byte position back to a character position by counting the
		// lead bytes between the starting offset and the match.
		$position = $chars_seen;
		for ( $i = $byte_offset; $i < $byte_pos; $i++ ) {
			if ( 0x80 !== ( ord( $haystack[ $i ] ) & 0xC0 ) ) {
				$position++;
			}
		}

		return $position;
	}
}
The `if ( ! function_exists( '_mb_strpos' ) ) {` guard
could probably be removed, since `_mb_strpos()` would become a core function.
To test this, I've used the following lines of code:
// Call the compat implementation directly with a range of offsets.
// The trailing comment on each line is the expected result.
var_dump( _mb_strpos( '象形指事', '指', 0 ) ); // 2
var_dump( _mb_strpos( '象形指事', '指', 1 ) ); // 2
var_dump( _mb_strpos( '象形指事', '指', 2 ) ); // 2
var_dump( _mb_strpos( '象形指事', '指', 3 ) ); // false
var_dump( _mb_strpos( '象形指事', '指', -1 ) ); // false WARNING
var_dump( _mb_strpos( '象形指事', '指', 4 ) ); // false
var_dump( _mb_strpos( '象形指事', '指', 5 ) ); // false WARNING
// Blank lines to separate the two groups of output.
echo PHP_EOL.PHP_EOL;
// The same calls through mb_strpos(), so the two groups can be compared.
var_dump( mb_strpos( '象形指事', '指', 0 ) ); // 2
var_dump( mb_strpos( '象形指事', '指', 1 ) ); // 2
var_dump( mb_strpos( '象形指事', '指', 2 ) ); // 2
var_dump( mb_strpos( '象形指事', '指', 3 ) ); // false
var_dump( mb_strpos( '象形指事', '指', -1 ) ); // false WARNING
var_dump( mb_strpos( '象形指事', '指', 4 ) ); // false
var_dump( mb_strpos( '象形指事', '指', 5 ) ); // false WARNING
Feel free to contribute your thoughts :) Thanks!
Attachments (1)
Change History (2)
Note: See
TracTickets for help on using
tickets.
(Redundant comments in diff file at bottom by mistake)
Added array to int conversion (needs testing).
Fixed the offset being ignored when the needle occurs two or more times.
Fixed the incorrect false return when the match is at position 0.