Index: formatting.php
===================================================================
--- formatting.php	(revision 29522)
+++ formatting.php	(working copy)
@@ -699,44 +699,88 @@
  *
  * @param string $string The text which is to be checked.
  * @param boolean $strip Optional. Whether to attempt to strip out invalid UTF8. Default is false.
- * @return string The checked text.
+ * @return string The unchanged string when it is valid UTF8 or the blog charset is not UTF8; the string with invalid bytes
+ *                removed when $strip is true and iconv or mbstring can do so; otherwise an empty string.
  */
 function wp_check_invalid_utf8( $string, $strip = false ) {
 	$string = (string) $string;
 
-	if ( 0 === strlen( $string ) ) {
+	// Nothing to validate: an empty string has no first byte (isset on the offset avoids a strlen() call)
+	if ( ! isset( $string[0] ) )
 		return '';
-	}
 
-	// Store the site charset as a static to avoid multiple calls to get_option()
+	// Whether the site charset is UTF-8, cached in a static to avoid repeated get_option() calls
 	static $is_utf8;
-	if ( !isset( $is_utf8 ) ) {
-		$is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) );
-	}
-	if ( !$is_utf8 ) {
+
+	// Accept only "utf8" or "utf-8" in any letter case; a loose substring test would also match UTF-16, UTF-7 and UTF-32
+	if ( ! isset( $is_utf8 ) )
+		$is_utf8 = ( 1 === preg_match( '/\Autf-?8\z/i', (string) get_option( 'blog_charset' ) ) );
+
+	// Other charsets cannot be validated as UTF-8, so hand the string back untouched
+	if ( ! $is_utf8 )
 		return $string;
-	}
 
+	// Pattern that makes PCRE validate the subject as UTF-8, or an empty string when this PCRE build has no UTF-8 support
+	static $pcre_utf8;
+
 	// Check for support for utf8 in the installed PCRE library once and store the result in a static
-	static $utf8_pcre;
-	if ( !isset( $utf8_pcre ) ) {
-		$utf8_pcre = @preg_match( '/^./u', 'a' );
-	}
-	// We can't demand utf8 in the PCRE installation, so just return the string in those cases
-	if ( !$utf8_pcre ) {
+	if ( ! isset( $pcre_utf8 ) ) {
+		if ( @preg_match( '//u', '' ) !== false )
+			$pcre_utf8 = '//u';
+		elseif ( @preg_match( '/(*UTF8)/', '' ) !== false )
+			$pcre_utf8 = '/(*UTF8)/';
+		else
+			$pcre_utf8 = '';
+	}
+
+	// In UTF-8 mode preg_match() returns false for a subject that is not valid UTF-8, so any other result means valid
+	if ( '' !== $pcre_utf8 && @preg_match( $pcre_utf8, $string ) !== false )
 		return $string;
-	}
 
-	// preg_match fails when it encounters invalid UTF8 in $string
-	if ( 1 === @preg_match( '/^./us', $string ) ) {
-		return $string;
+	// PCRE was built without UTF-8 support, so look for malformed byte sequences with a byte-wise regex instead
+	if ( '' === $pcre_utf8 ) {
+		$pattern = '/(
+			[\xC0-\xC1] # Invalid UTF-8 Bytes
+			| [\xF5-\xFF] # Invalid UTF-8 Bytes
+			| \xE0[\x80-\x9F] # Overlong encoding of prior code point
+			| \xF0[\x80-\x8F] # Overlong encoding of prior code point
+			| \xED[\xA0-\xBF] # UTF-16 surrogate halves (U+D800 to U+DFFF)
+			| \xF4[\x90-\xBF] # Code points above U+10FFFF
+			| [\xC2-\xDF](?![\x80-\xBF]) # Invalid UTF-8 Sequence Start
+			| [\xE0-\xEF](?![\x80-\xBF]{2}) # Invalid UTF-8 Sequence Start
+			| [\xF0-\xF4](?![\x80-\xBF]{3}) # Invalid UTF-8 Sequence Start
+			| (?<=[\x0-\x7F\xF5-\xFF])[\x80-\xBF] # Invalid UTF-8 Sequence Middle
+			| (?<![\xC2-\xDF]|[\xE0-\xEF]|[\xE0-\xEF][\x80-\xBF]|[\xF0-\xF4]|[\xF0-\xF4][\x80-\xBF]|[\xF0-\xF4][\x80-\xBF]{2})[\x80-\xBF] # Overlong Sequence
+			| (?<=[\xE0-\xEF])[\x80-\xBF](?![\x80-\xBF]) # Short 3 byte sequence
+			| (?<=[\xF0-\xF4])[\x80-\xBF](?![\x80-\xBF]{2}) # Short 4 byte sequence
+			| (?<=[\xF0-\xF4][\x80-\xBF])[\x80-\xBF](?![\x80-\xBF]) # Short 4 byte sequence (2)
+		)/x';
+
+		// The pattern matches invalid sequences, so zero matches means the string is valid UTF-8
+		if ( 0 === @preg_match( $pattern, $string ) )
+			return $string;
 	}
 
 	// Attempt to strip the bad chars if requested (not recommended)
-	if ( $strip && function_exists( 'iconv' ) ) {
-		return iconv( 'utf-8', 'utf-8', $string );
+	if ( $strip ) {
+		// Prefer iconv with the ignore flag to drop bad bytes; it can still return false, so fall through in that case
+		if ( function_exists( 'iconv' ) ) {
+			$stripped = @iconv( 'utf-8', 'utf-8//ignore', $string );
+			if ( false !== $stripped )
+				return $stripped;
+		}
+
+		// Otherwise use mbstring with the substitute character set to none, restoring the previous setting afterwards
+		if ( function_exists( 'mb_convert_encoding' ) ) {
+			$previous = mb_substitute_character();
+			mb_substitute_character( 'none' );
+			$stripped = mb_convert_encoding( $string, 'utf-8', 'utf-8' );
+			mb_substitute_character( $previous );
+			return $stripped;
+		}
 	}
 
+	// Invalid UTF-8 that was not, or could not be, stripped: signal it with an empty string
 	return '';
 }
 
