| 1107 | | * @param bool $strip Optional. Whether to attempt to strip out invalid UTF8. Default is false. |
| 1108 | | * @return string The checked text. |
| | 1107 | * @param bool $strip Optional. Whether to attempt to strip out invalid UTF8, using the bytewise regex. Default is false. |
| | 1108 | * @param bool $bytewise_fallback Optional. Use the bytewise regex when UTF8 regex is unavailable. Default is false. |
| | 1109 | * @param bool $bytewise_always Optional. Use only the bytewise regex. Default is false. |
| | 1110 | * @return string Unmodified string if valid UTF8; or blog_charset is not UTF8; or UTF8 regex unavailable and $bytewise off. |
| | 1111 | * Otherwise, empty string or string stripped of invalid chars. |
| 1128 | | if ( ! isset( $utf8_pcre ) ) { |
| 1129 | | $utf8_pcre = @preg_match( '/^./u', 'a' ); |
| | 1130 | if ( ! $bytewise_always ) { |
| | 1131 | // Check for support for utf8 in the installed PCRE library once and store the result in a static |
| | 1132 | if ( ! isset( $utf8_pcre ) ) { |
| | 1133 | $utf8_pcre = @preg_match( '/^./u', 'a' ); // Returns false on error. |
| | 1134 | } |
| | 1135 | if ( $utf8_pcre ) { |
| | 1136 | // preg_match fails when it encounters invalid UTF8 in $string |
| | 1137 | if ( 1 === @preg_match( '/^./us', $string ) ) { |
| | 1138 | return $string; |
| | 1139 | } |
| | 1140 | if ( ! $strip ) { |
| | 1141 | return ''; |
| | 1142 | } |
| | 1143 | } elseif ( ! $bytewise_fallback ) { |
| | 1144 | return $string; |
| | 1145 | } |
| 1131 | | // We can't demand utf8 in the PCRE installation, so just return the string in those cases |
| 1132 | | if ( ! $utf8_pcre ) { |
| 1133 | | return $string; |
| 1134 | | } |
| | 1147 | if ( $strip || $bytewise_fallback || $bytewise_always ) { |
| | 1148 | // Bytewise regex captures valid UTF8 in blocks of 40 and skips invalid characters. |
| | 1149 | $regex = '/ |
| | 1150 | ( |
| | 1151 | (?: [\x00-\x7F] # single-byte sequences 0xxxxxxx |
| | 1152 | | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx |
| | 1153 | | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 |
| | 1154 | | [\xE1-\xEC][\x80-\xBF]{2} |
| | 1155 | | \xED[\x80-\x9F][\x80-\xBF] |
| | 1156 | | [\xEE-\xEF][\x80-\xBF]{2} |
| | 1157 | | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 |
| | 1158 | | [\xF1-\xF3][\x80-\xBF]{3} |
| | 1159 | | \xF4[\x80-\x8F][\x80-\xBF]{2} |
| | 1160 | ){1,40} # ...one or more times |
| | 1161 | ) | . # anything else |
| | 1162 | /x'; |
| 1136 | | // preg_match fails when it encounters invalid UTF8 in $string |
| 1137 | | if ( 1 === @preg_match( '/^./us', $string ) ) { |
| 1138 | | return $string; |
| | 1164 | // Remove invalid byte sequences. |
| | 1165 | $clean_string = @preg_replace( $regex, '$1', $string ); // Returns null on error. |
| | 1166 | if ( $clean_string === $string ) { |
| | 1167 | return $string; |
| | 1168 | } |
| | 1169 | // Return string with invalid characters stripped if requested (not recommended). |
| | 1170 | if ( $strip && is_string( $clean_string ) ) { |
| | 1171 | return $clean_string; |
| | 1172 | } |