713 | | if ( !isset( $is_utf8 ) ) { |
714 | | $is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ); |
| 713 | |
| 714 | // The first time this function is called, cache in a static whether the blog charset is UTF-8
| 715 | if ( ! isset( $is_utf8 ) ) { |
| 716 | $is_utf8 = get_option( 'blog_charset' ); |
| 717 | |
| 718 | // boolean to make sure it is utf-8 |
| 719 | $is_utf8 = ( stripos( $is_utf8, 'utf' ) !== false && strpos( $is_utf8, '8' ) !== false ); |
721 | | static $utf8_pcre; |
722 | | if ( !isset( $utf8_pcre ) ) { |
723 | | $utf8_pcre = @preg_match( '/^./u', 'a' ); |
724 | | } |
725 | | // We can't demand utf8 in the PCRE installation, so just return the string in those cases |
726 | | if ( !$utf8_pcre ) { |
| 730 | if ( ! isset( $pcre_utf8 ) ) |
| 731 | $pcre_utf8 = ( @preg_match( '//u', '' ) !== false ); |
| 732 | |
| 733 | // If pcre_utf /u modifier allowed and the utf is valid, return string |
| 734 | if ( $pcre_utf8 && @preg_match( '//u', $string ) !== false ) |
730 | | // preg_match fails when it encounters invalid UTF8 in $string |
731 | | if ( 1 === @preg_match( '/^./us', $string ) ) { |
732 | | return $string; |
| 737 | // just means that the /u modifier is disallowed, so try the pattern option |
| 738 | if ( ! $pcre_utf8 ) { |
| 739 | |
| 740 | // Use the pattern option for pcre and return string if valid. Else use the regex |
| 741 | if ( @preg_match( '/(*UTF8)/', $string ) !== false ) { |
| 742 | return $string; |
| 743 | } else { |
| 744 | |
| 745 | // PCRE was compiled explicitly to forbid UTF support, so use a regex to check (third time's the charm)
| 746 | $pattern = '/( |
| 747 | [\xC0-\xC1] # Invalid UTF-8 Bytes |
| 748 | | [\xF5-\xFF] # Invalid UTF-8 Bytes |
| 749 | | \xE0[\x80-\x9F] # Overlong encoding of prior code point |
| 750 | | \xF0[\x80-\x8F] # Overlong encoding of prior code point |
| 751 | | [\xC2-\xDF](?![\x80-\xBF]) # Invalid UTF-8 Sequence Start |
| 752 | | [\xE0-\xEF](?![\x80-\xBF]{2}) # Invalid UTF-8 Sequence Start |
| 753 | | [\xF0-\xF4](?![\x80-\xBF]{3}) # Invalid UTF-8 Sequence Start |
| 754 | | (?<=[\x0-\x7F\xF5-\xFF])[\x80-\xBF] # Invalid UTF-8 Sequence Middle |
| 755 | | (?<![\xC2-\xDF]|[\xE0-\xEF]|[\xE0-\xEF][\x80-\xBF]|[\xF0-\xF4]|[\xF0-\xF4][\x80-\xBF]|[\xF0-\xF4][\x80-\xBF]{2})[\x80-\xBF] # Overlong Sequence |
| 756 | | (?<=[\xE0-\xEF])[\x80-\xBF](?![\x80-\xBF]) # Short 3 byte sequence |
| 757 | | (?<=[\xF0-\xF4])[\x80-\xBF](?![\x80-\xBF]{2}) # Short 4 byte sequence |
| 758 | | (?<=[\xF0-\xF4][\x80-\xBF])[\x80-\xBF](?![\x80-\xBF]) # Short 4 byte sequence (2) |
| 759 | )/x'; |
| 760 | |
| 761 | // if the utf is valid return the string |
| 762 | if ( @preg_match( $pattern , $string ) === 1 ) |
| 763 | return $string; |
| 764 | } |
736 | | if ( $strip && function_exists( 'iconv' ) ) { |
737 | | return iconv( 'utf-8', 'utf-8', $string ); |
| 768 | if ( $strip ) { |
| 769 | |
| 770 | // Whether iconv is available
| 771 | static $iconv; |
| 772 | |
| 773 | // Check once for support for iconv and save statically, if not set the mbstring.substitute_character ini |
| 774 | if ( ! isset( $iconv ) ) { |
| 775 | $iconv = ( function_exists( 'iconv' ) ); |
| 776 | |
| 777 | if ( ! $iconv ) { |
| 778 | |
| 779 | // only create a static var if $iconv is not available since only then will the final if ( $mb_convert ) check utilize this var |
| 780 | static $mb_convert; |
| 781 | $mb_convert = ( function_exists( 'mb_convert_encoding' ) ); |
| 782 | |
| 783 | // if mb_convert_encoding is available, set the ini_setting once |
| 784 | if ( $mb_convert ) |
| 785 | @ini_set( 'mbstring.substitute_character', 'none' ); |
| 786 | } |
| 787 | } |
| 788 | |
| 789 | |
| 790 | // use iconv if exists |
| 791 | if ( $iconv ) |
| 792 | return @iconv( 'utf-8', 'utf-8//ignore', $string ); |
| 793 | |
| 794 | // otherwise try to use mb_convert_encoding, setting the substitute_character to none to mimic strip
| 795 | if ( $mb_convert ) |
| 796 | return @mb_convert_encoding( $string, 'utf-8', 'utf-8' ); |