1107 | | * @param bool $strip Optional. Whether to attempt to strip out invalid UTF8. Default is false. |
1108 | | * @return string The checked text. |
| 1107 | * @param bool $strip Optional. Whether to attempt to strip out invalid UTF8, using the bytewise regex. Default is false. |
| 1108 | * @param bool $bytewise_fallback Optional. Use the bytewise regex when UTF8 regex is unavailable. Default is false. |
| 1109 | * @param bool $bytewise_always Optional. Use only the bytewise regex. Default is false. |
| 1110 | * @return string Unmodified string if valid UTF8; or blog_charset is not UTF8; or UTF8 regex unavailable and $bytewise off. |
| 1111 | * Otherwise, empty string or string stripped of invalid chars. |
1128 | | if ( ! isset( $utf8_pcre ) ) { |
1129 | | $utf8_pcre = @preg_match( '/^./u', 'a' ); |
| 1130 | if ( ! $bytewise_always ) { |
| 1131 | // Check for support for utf8 in the installed PCRE library once and store the result in a static |
| 1132 | if ( ! isset( $utf8_pcre ) ) { |
| 1133 | $utf8_pcre = @preg_match( '/^./u', 'a' ); // Returns false on error. |
| 1134 | } |
| 1135 | if ( $utf8_pcre ) { |
| 1136 | // preg_match fails when it encounters invalid UTF8 in $string |
| 1137 | if ( 1 === @preg_match( '/^./us', $string ) ) { |
| 1138 | return $string; |
| 1139 | } |
| 1140 | if ( ! $strip ) { |
| 1141 | return ''; |
| 1142 | } |
| 1143 | } elseif ( ! $bytewise_fallback ) { |
| 1144 | return $string; |
| 1145 | } |
1131 | | // We can't demand utf8 in the PCRE installation, so just return the string in those cases |
1132 | | if ( ! $utf8_pcre ) { |
1133 | | return $string; |
1134 | | } |
| 1147 | if ( $strip || $bytewise_fallback || $bytewise_always ) { |
| 1148 | // Bytewise regex captures valid UTF8 in blocks of 40 and skips invalid characters. |
| 1149 | $regex = '/ |
| 1150 | ( |
| 1151 | (?: [\x00-\x7F] # single-byte sequences 0xxxxxxx |
| 1152 | | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx |
| 1153 | | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 |
| 1154 | | [\xE1-\xEC][\x80-\xBF]{2} |
| 1155 | | \xED[\x80-\x9F][\x80-\xBF] |
| 1156 | | [\xEE-\xEF][\x80-\xBF]{2} |
| 1157 | | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 |
| 1158 | | [\xF1-\xF3][\x80-\xBF]{3} |
| 1159 | | \xF4[\x80-\x8F][\x80-\xBF]{2} |
| 1160 | ){1,40} # ...one or more times |
| 1161 | ) | . # anything else |
| 1162 | /x'; |
1136 | | // preg_match fails when it encounters invalid UTF8 in $string |
1137 | | if ( 1 === @preg_match( '/^./us', $string ) ) { |
1138 | | return $string; |
| 1164 | // Remove invalid byte sequences. |
| 1165 | $clean_string = @preg_replace( $regex, '$1', $string ); // Returns null on error. |
| 1166 | if ( $clean_string === $string ) { |
| 1167 | return $string; |
| 1168 | } |
| 1169 | // Return string with invalid characters stripped if requested (not recommended). |
| 1170 | if ( $strip && is_string( $clean_string ) ) { |
| 1171 | return $clean_string; |
| 1172 | } |