| 713 | | if ( !isset( $is_utf8 ) ) { |
| 714 | | $is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ); |
| | 716 | |
| | 717 | // the first time this function is called, statically save a boolean for whether utf-8 is enabled |
| | 718 | if ( ! isset( $is_utf8 ) ) { |
| | 719 | $is_utf8 = ( in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ); |
| 730 | | // preg_match fails when it encounters invalid UTF8 in $string |
| 731 | | if ( 1 === @preg_match( '/^./us', $string ) ) { |
| 732 | | return $string; |
| | 740 | // just means that the /u modifier is disallowed, so try the pattern option |
| | 741 | if ( ! $pcre_utf8 ) { |
| | 742 | |
| | 743 | // Use the pattern option for pcre and return string if valid. Else use the regex |
| | 744 | if ( @preg_match( '/(*UTF8)/', $string ) !== false ) { |
| | 745 | return $string; |
| | 746 | } else { |
| | 747 | |
| | 748 | // PCRE was explicitly compiled to forbid UTF support, so use a regex to check (third time's the charm) |
| | 749 | $pattern = '/( |
| | 750 | [\xC0-\xC1] # Invalid UTF-8 Bytes |
| | 751 | | [\xF5-\xFF] # Invalid UTF-8 Bytes |
| | 752 | | \xE0[\x80-\x9F] # Overlong encoding of prior code point |
| | 753 | | \xF0[\x80-\x8F] # Overlong encoding of prior code point |
| | 754 | | [\xC2-\xDF](?![\x80-\xBF]) # Invalid UTF-8 Sequence Start |
| | 755 | | [\xE0-\xEF](?![\x80-\xBF]{2}) # Invalid UTF-8 Sequence Start |
| | 756 | | [\xF0-\xF4](?![\x80-\xBF]{3}) # Invalid UTF-8 Sequence Start |
| | 757 | | (?<=[\x0-\x7F\xF5-\xFF])[\x80-\xBF] # Invalid UTF-8 Sequence Middle |
| | 758 | | (?<![\xC2-\xDF]|[\xE0-\xEF]|[\xE0-\xEF][\x80-\xBF]|[\xF0-\xF4]|[\xF0-\xF4][\x80-\xBF]|[\xF0-\xF4][\x80-\xBF]{2})[\x80-\xBF] # Overlong Sequence |
| | 759 | | (?<=[\xE0-\xEF])[\x80-\xBF](?![\x80-\xBF]) # Short 3 byte sequence |
| | 760 | | (?<=[\xF0-\xF4])[\x80-\xBF](?![\x80-\xBF]{2}) # Short 4 byte sequence |
| | 761 | | (?<=[\xF0-\xF4][\x80-\xBF])[\x80-\xBF](?![\x80-\xBF]) # Short 4 byte sequence (2) |
| | 762 | )/x'; |
| | 763 | |
| | 764 | // if the utf is valid return the string |
| | 765 | if ( @preg_match( $pattern , $string ) === 1 ) { |
| | 766 | return $string; |
| | 767 | } |
| | 768 | } |
| 736 | | if ( $strip && function_exists( 'iconv' ) ) { |
| 737 | | return iconv( 'utf-8', 'utf-8', $string ); |
| | 772 | if ( $strip ) { |
| | 773 | // Whether iconv is available |
| | 774 | static $iconv; |
| | 775 | |
| | 776 | // Check once for iconv support and save the result statically; if iconv is unavailable, set the mbstring.substitute_character ini |
| | 777 | if ( ! isset( $iconv ) ) { |
| | 778 | $iconv = function_exists( 'iconv' ); |
| | 779 | |
| | 780 | if ( ! $iconv ) { |
| | 781 | // only create a static var if $iconv is not available since only then will the final if ( $mb_convert ) check utilize this var |
| | 782 | static $mb_convert; |
| | 783 | |
| | 784 | // if mb_convert_encoding is available, set the ini_setting once |
| | 785 | @ini_set( 'mbstring.substitute_character', 'none' ); |
| | 786 | $mb_convert = ( function_exists( 'mb_convert_encoding' ) && @ini_get( 'mbstring.substitute_character' ) === 'none' ); |
| | 787 | } |
| | 788 | } |
| | 789 | |
| | 790 | // use iconv if exists |
| | 791 | if ( $iconv ) { |
| | 792 | return @iconv( 'utf-8', 'utf-8//ignore', $string ); |
| | 793 | } |
| | 794 | |
| | 795 | // otherwise try to use mb_convert_encoding, setting the substitute_character to none to mimic strip |
| | 796 | if ( $mb_convert ) { |
| | 797 | return @mb_convert_encoding( $string, 'utf-8', 'utf-8' ); |
| | 798 | } |