713 | | if ( !isset( $is_utf8 ) ) { |
714 | | $is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ); |
715 | | } |
716 | | if ( !$is_utf8 ) { |
| 713 | |
| 714 | // if this is the first time this function is called, save (in the static) whether utf is enabled or not |
| 715 | if ( ! isset( $is_utf8 ) ) |
| 716 | $is_utf8 = ( stripos( get_option( 'blog_charset' ), 'utf' ) !== false ); |
| 717 | |
| 718 | // if utf not used return the string |
| 719 | if ( ! $is_utf8 ) |
721 | | static $utf8_pcre; |
722 | | if ( !isset( $utf8_pcre ) ) { |
723 | | $utf8_pcre = @preg_match( '/^./u', 'a' ); |
724 | | } |
725 | | // We can't demand utf8 in the PCRE installation, so just return the string in those cases |
726 | | if ( !$utf8_pcre ) { |
| 726 | if ( ! isset( $pcre_utf8 ) ) |
| 727 | $pcre_utf8 = ( @preg_match( '//u', '' ) !== false ); |
| 728 | |
| 729 | // If pcre_utf /u modifier allowed and the utf is valid, return string |
| 730 | if ( $pcre_utf8 && @preg_match( '//u', $string ) !== false ) |
730 | | // preg_match fails when it encounters invalid UTF8 in $string |
731 | | if ( 1 === @preg_match( '/^./us', $string ) ) { |
732 | | return $string; |
| 733 | // just means that the /u modifier is disallowed, so try the pattern option |
| 734 | if ( ! $pcre_utf8 ) { |
| 735 | |
| 736 | // Use the pattern option for pcre and return string if valid. Else use the regex |
| 737 | if ( @preg_match( '/(*UTF8)/', '' ) !== false ) { |
| 738 | return $string; |
| 739 | } else { |
| 740 | |
| 741 | // pcre was compiled explicitly to forbid the support of UTF, so use a regex to check (third time's the charm) |
| 742 | $pattern = '/( |
| 743 | [\xC0-\xC1] # Invalid UTF-8 Bytes |
| 744 | | [\xF5-\xFF] # Invalid UTF-8 Bytes |
| 745 | | \xE0[\x80-\x9F] # Overlong encoding of prior code point |
| 746 | | \xF0[\x80-\x8F] # Overlong encoding of prior code point |
| 747 | | [\xC2-\xDF](?![\x80-\xBF]) # Invalid UTF-8 Sequence Start |
| 748 | | [\xE0-\xEF](?![\x80-\xBF]{2}) # Invalid UTF-8 Sequence Start |
| 749 | | [\xF0-\xF4](?![\x80-\xBF]{3}) # Invalid UTF-8 Sequence Start |
| 750 | | (?<=[\x0-\x7F\xF5-\xFF])[\x80-\xBF] # Invalid UTF-8 Sequence Middle |
| 751 | | (?<![\xC2-\xDF]|[\xE0-\xEF]|[\xE0-\xEF][\x80-\xBF]|[\xF0-\xF4]|[\xF0-\xF4][\x80-\xBF]|[\xF0-\xF4][\x80-\xBF]{2})[\x80-\xBF] # Overlong Sequence |
| 752 | | (?<=[\xE0-\xEF])[\x80-\xBF](?![\x80-\xBF]) # Short 3 byte sequence |
| 753 | | (?<=[\xF0-\xF4])[\x80-\xBF](?![\x80-\xBF]{2}) # Short 4 byte sequence |
| 754 | | (?<=[\xF0-\xF4][\x80-\xBF])[\x80-\xBF](?![\x80-\xBF]) # Short 4 byte sequence (2) |
| 755 | )/x'; |
| 756 | |
| 757 | // if the utf is valid return the string |
| 758 | if ( @preg_match( $pattern , $string ) === 1 ) |
| 759 | return $string; |
| 760 | } |
736 | | if ( $strip && function_exists( 'iconv' ) ) { |
737 | | return iconv( 'utf-8', 'utf-8', $string ); |
| 764 | if ( $strip ) { |
| 765 | // try to use iconv if exists |
| 766 | if ( function_exists( 'iconv' ) ) |
| 767 | return @iconv( 'utf-8', 'utf-8//ignore', $string ); |
| 768 | |
| 769 | // otherwise try to use mb_convert_encoding, setting the substitute_character to none to mimic strip |
| 770 | if ( function_exists( 'mb_convert_encoding' ) ) { |
| 771 | @ini_set( 'mbstring.substitute_character', 'none' ); |
| 772 | return @mb_convert_encoding( $string, 'utf-8', 'utf-8' ); |
| 773 | } |