Ticket #29717: bug29717.k1.patch
File bug29717.k1.patch, 3.3 KB (added by , 6 years ago) |
---|
-
wp-includes/formatting.php
1105 1105 * 1106 1106 * @param string $string The text which is to be checked. 1107 1107 * @param bool $strip Optional. Whether to attempt to strip out invalid UTF8. Default is false. 1108 * @return string The checked text. 1108 * @param string $bytewise Optional. Choose '' to use bytewise regex only for implementing $strip, 1109 * 'fallback' to use bytewise regex when UTF8 regex is unavailable, 1110 * 'always' to use bytewise regex always. Default is ''. 1111 * @return string Unmodified string if valid UTF8; or blog_charset is not UTF8; or UTF8 regex unavailable and $bytewise falsey. 1112 * Otherwise, empty string or string stripped of invalid chars. 1109 1113 */ 1110 function wp_check_invalid_utf8( $string, $strip = false ) {1114 function wp_check_invalid_utf8( $string, $strip = false, $bytewise = '' ) { 1111 1115 $string = (string) $string; 1112 1116 1113 if ( 0 === strlen( $string )) {1117 if ( '' == $string ) { 1114 1118 return ''; 1115 1119 } 1116 1120 … … 1126 1130 // Check for support for utf8 in the installed PCRE library once and store the result in a static 1127 1131 static $utf8_pcre = null; 1128 1132 if ( ! isset( $utf8_pcre ) ) { 1129 $utf8_pcre = @preg_match( '/^./u', 'a' ); 1133 if ( 'always' === $bytewise ) { 1134 $utf8_pcre = false; 1135 } else { 1136 $utf8_pcre = @preg_match( '/^./u', 'a' ); // Returns false on error. 1137 } 1130 1138 } 1131 // We can't demand utf8 in the PCRE installation, so just return the string in those cases 1132 if ( ! $utf8_pcre ) { 1139 if ( $utf8_pcre ) { 1140 // preg_match fails when it encounters invalid UTF8 in $string 1141 if ( 1 === @preg_match( '/^./us', $string ) ) { 1142 return $string; 1143 } 1144 if ( ! $strip ) { 1145 return ''; 1146 } 1147 } elseif ( ! $bytewise ) { 1133 1148 return $string; 1134 1149 } 1150 if ( $strip || $bytewise ) { // $strip, or $bytewise is 'fallback' or 'always' 1151 // Bytewise regex captures valid UTF8 in blocks of 40 and skips invalid characters. 
1152 $regex = '/ 1153 ( 1154 (?: [\x00-\x7F] # single-byte sequences 0xxxxxxx 1155 | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx 1156 | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 1157 | [\xE1-\xEC][\x80-\xBF]{2} 1158 | \xED[\x80-\x9F][\x80-\xBF] 1159 | [\xEE-\xEF][\x80-\xBF]{2} 1160 | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 1161 | [\xF1-\xF3][\x80-\xBF]{3} 1162 | \xF4[\x80-\x8F][\x80-\xBF]{2} 1163 ){1,40} # ...one or more times 1164 ) | . # anything else 1165 /x'; 1135 1166 1136 // preg_match fails when it encounters invalid UTF8 in $string 1137 if ( 1 === @preg_match( '/^./us', $string ) ) { 1138 return $string; 1167 // Remove invalid byte sequences. 1168 $clean_string = @preg_replace( $regex, '$1', $string ); // Returns null on error. 1169 if ( $clean_string === $string ) { 1170 return $string; 1171 } 1172 // Return string with invalid characters stripped if requested (not recommended). 1173 if ( $strip && is_string( $clean_string ) ) { 1174 return $clean_string; 1175 } 1139 1176 } 1140 1141 // Attempt to strip the bad chars if requested (not recommended)1142 if ( $strip && function_exists( 'iconv' ) ) {1143 return iconv( 'utf-8', 'utf-8', $string );1144 }1145 1146 1177 return ''; 1147 1178 } 1148 1179