### Eclipse Workspace Patch 1.0
#P wordpress-trunk
Index: wp-includes/formatting.php
===================================================================
--- wp-includes/formatting.php	(revision 12551)
+++ wp-includes/formatting.php	(working copy)
@@ -264,7 +264,7 @@
 		elseif (($c & 0xF0) == 0xE0) $n=2; # 1110bbbb
 		elseif (($c & 0xF8) == 0xF0) $n=3; # 11110bbb
 		elseif (($c & 0xFC) == 0xF8) $n=4; # 111110bb
-		elseif (($c & 0xFE) == 0xFC) $n=5; # 1111110b
+		elseif (($c & 0xFE) == 0xFC) $n=5; # 1111110b // invalid UTF-8, in here for backcompat reasons
 		else return false; # Does not match any model
 		for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
 			if ((++$i == $length) || ((ord($str[$i]) & 0xC0) != 0x80))
@@ -275,6 +275,138 @@
 }
 
 /**
+ * Checks whether a string is valid UTF-8.
+ *
+ * @author hakre
+ * @since  3.0
+ *
+ * @param  string $str The string to be checked
+ * @return bool   true if $str is UTF-8 encoded, false otherwise.
+ */
+function is_valid_utf8( $str ) {
+	// Prefer the iconv round-trip when the extension is loaded; otherwise use the PCRE check.
+	if ( function_exists( 'iconv' ) )
+		return is_valid_utf8_iconv( $str );
+	return is_valid_utf8_preg( $str );
+}
+
+/**
+ * Checks to see if a string is utf8 encoded by round-tripping it through iconv.
+ *
+ * see: Cal Henderson: Building Scalable Web Sites (p. 96), O'Reilly 2006
+ *
+ * @author hakre
+ * @since  3.0
+ *
+ * @param  string $str The string to be checked
+ * @return bool   true if $str is UTF-8 encoded, false otherwise.
+ */
+function is_valid_utf8_iconv( $str ) {
+	// iconv() raises E_NOTICE and fails on illegal input; that is the expected "invalid" case, so silence it.
+	$out = @iconv( 'UTF-8', 'UTF-8', $str );
+	return $out === (string) $str; // strict: a failed conversion must never compare equal to the input
+}
+
+/**
+ * Checks to see if a string is utf8 encoded (as defined by RFC 3629), using PCRE.
+ *
+ * see: Cal Henderson: Building Scalable Web Sites (p. 94, 95), O'Reilly 2006
+ *
+ * @author hakre
+ * @since  3.0
+ * @link   http://codex.wordpress.org/User:Hakre/UTF8
+ *
+ * @param  string $str The string to be checked
+ * @return bool   true if $str is UTF-8 encoded, false otherwise.
+ */
+function is_valid_utf8_preg( $str ) {
+	// Every alternative matches one kind of malformed sequence, so any match means "invalid".
+	$invalidchars =
+		'[\xC0-\xDF]([^\x80-\xBF]|$)' .        // lead byte followed by too few continuation bytes
+		'|[\xE0-\xEF].{0,1}([^\x80-\xBF]|$)' .
+		'|[\xF0-\xF7].{0,2}([^\x80-\xBF]|$)' .
+		'|[\x00-\x7F][\x80-\xBF]' .            // continuation byte that no lead byte announced
+		'|^[\x80-\xBF]' .
+		'|[\xC0-\xDF].[\x80-\xBF]' .           // more continuation bytes than the lead byte announces
+		'|[\xE0-\xEF]..[\x80-\xBF]' .
+		'|[\xF0-\xF7]...[\x80-\xBF]' .
+		'|[\xC0\xC1\xF5-\xFF]' .               // bytes that never occur in UTF-8 (incl. 5/6-byte leads)
+		'|\xE0[\x80-\x9F]' .                   // overlong 3-byte form
+		'|\xF0[\x80-\x8F]' .                   // overlong 4-byte form
+		'|\xED[\xA0-\xBF]' .                   // UTF-16 surrogates U+D800..U+DFFF
+		'|\xF4[\x90-\xBF]';                    // code points above U+10FFFF
+
+	// preg_match() returns false on a PCRE error; treat that as "not valid" rather than accepting the input.
+	return 0 === preg_match( "!$invalidchars!", $str );
+}
+
+/**
+ * Checks to see if a string is utf8 encoded.
+ *
+ * Stricter than plain well-formedness: control characters other than tab,
+ * LF and CR, DEL, and the code points U+FFFE / U+FFFF are rejected as well.
+ *
+ * @author hakre (based on code by schiller in #5998)
+ * @since  3.0
+ * @link   http://core.trac.wordpress.org/ticket/5998
+ * @link   http://codex.wordpress.org/User:Hakre/UTF8
+ *
+ * @param  string $str The string to be checked
+ * @return bool   true if $str is UTF-8 encoded, false otherwise.
+ */
+function is_valid_utf8_preg5998( $str ) {
+	// The outer parentheses double as the PCRE delimiters.
+	$validchars = '(' .
+		'[\x09\x0A\x0D\x20-\x7E]' .        // ASCII
+		'|[\xC2-\xDF][\x80-\xBF]' .        // non-overlong 2-byte
+		'|\xE0[\xA0-\xBF][\x80-\xBF]' .    // excluding overlongs
+		'|[\xE1-\xEC\xEE][\x80-\xBF]{2}' . // 3-byte, but exclude U-FFFE and U-FFFF
+		'|\xEF[\x80-\xBE][\x80-\xBF]' .
+		'|\xEF\xBF[\x80-\xBD]' .
+		'|\xED[\x80-\x9F][\x80-\xBF]' .    // excluding surrogates
+		'|\xF0[\x90-\xBF][\x80-\xBF]{2}' . // planes 1-3
+		'|[\xF1-\xF3][\x80-\xBF]{3}' .     // planes 4-15
+		'|\xF4[\x80-\x8F][\x80-\xBF]{2}' . // plane 16
+	')';
+
+	// Strip every valid character; whatever is left over is an invalid byte.
+	// preg_replace() yields NULL on a PCRE error, which also counts as invalid.
+	$result = preg_replace( $validchars, '', $str );
+
+	return '' === $result;
+}
+
+/**
+ * Checks to see if a string is utf8 encoded.
+ *
+ * NOTE: This function conforms with the UTF-8 standard (RFC 3629),
+ *       seems_utf8() does not.
+ *
+ * @author hakre
+ * @since  3.0
+ *
+ * @param  string $str The string to be checked
+ * @return bool   true if $str is UTF-8 encoded, false otherwise.
+ */
+function is_valid_utf8_statemachine( $str ) {
+	for ($i=0, $length=strlen($str); $i < $length; $i++) {
+		$c = ord($str[$i]);
+		if ($c < 0x80) $n = 0; # 0bbbbbbb
+		elseif ($c >= 0xC2 && $c <= 0xDF) $n=1; # 110bbbbb, 0xC0/0xC1 would be overlong
+		elseif ($c >= 0xE0 && $c <= 0xEF) $n=2; # 1110bbbb
+		elseif ($c >= 0xF0 && $c <= 0xF4) $n=3; # 11110bbb, 0xF5 and up would exceed U+10FFFF
+		else return false; # stray continuation byte or a byte that never starts a sequence
+		$min = ($c == 0xE0) ? 0xA0 : (($c == 0xF0) ? 0x90 : 0x80); # first continuation byte: no overlong forms
+		$max = ($c == 0xED) ? 0x9F : (($c == 0xF4) ? 0x8F : 0xBF); # ... no surrogates, nothing above U+10FFFF
+		for ($j=0; $j<$n; $j++, $min=0x80, $max=0xBF) { # later bytes only need to match 10bbbbbb
+			if ((++$i == $length) || (($b = ord($str[$i])) < $min) || ($b > $max))
+				return false;
+		}
+	}
+	return true;
+}
+
+/**
  * Converts a number of special characters into their HTML entities.
  *
  * Specifically deals with: &, <, >, ", and '.
