### Eclipse Workspace Patch 1.0
#P wordpress-trunk
Index: wp-comments-post.php
===================================================================
--- wp-comments-post.php	(revision 12551)
+++ wp-comments-post.php	(working copy)
@@ -42,6 +42,17 @@
 $comment_author_url   = ( isset($_POST['url']) )     ? trim($_POST['url']) : null;
 $comment_content      = ( isset($_POST['comment']) ) ? trim($_POST['comment']) : null;
 
+// validate input as utf8; the field labels are translated on their own
+$fields = array(
+	'comment_author'       => __( 'Name' ),
+	'comment_author_email' => __( 'Mail Address' ),
+	'comment_author_url'   => __( 'Website URL' ),
+	'comment_content'      => __( 'Comment' ) );
+
+foreach ( $fields as $var => $title )
+	if ( ! is_valid_utf8( $GLOBALS[$var] ) ) // translate the template first, then fill in the label
+		wp_die( sprintf( __( 'Sorry, I cannot let you post that. You entered invalid utf-8 characters in your %s.' ), $title ) );
+
 // If the user is logged in
 $user = wp_get_current_user();
 if ( $user->ID ) {
Index: wp-includes/comment.php
===================================================================
--- wp-includes/comment.php	(revision 12551)
+++ wp-includes/comment.php	(working copy)
@@ -1235,6 +1235,12 @@
  * @return int The ID of the comment after adding.
  */
 function wp_new_comment( $commentdata ) {
+	// NOTE(review): XHTML is registered as application/xhtml+xml (RFC 3236); confirm 'text/xhtml+xml' is intended.
+	if ( get_bloginfo( 'html_type' ) == 'text/xhtml+xml'
+		&& get_bloginfo( 'charset' ) == 'UTF-8'
+		&& ! is_wellformed_xml( $commentdata['comment_content'], $err ) )
+			wp_die( $err ); // built at runtime by the XML parser, so __() has nothing to look up
+
 	$commentdata = apply_filters('preprocess_comment', $commentdata);
 
 	$commentdata['comment_post_ID'] = (int) $commentdata['comment_post_ID'];
Index: wp-includes/formatting.php
===================================================================
--- wp-includes/formatting.php	(revision 12551)
+++ wp-includes/formatting.php	(working copy)
@@ -246,7 +246,7 @@
 /**
  * Checks to see if a string is utf8 encoded.
  *
- * NOTE: This function checks for 5-Byte sequences, UTF8
+ * NOTE: This function accepts sequences of up to 6 Bytes, although UTF8
  *       has Bytes Sequences with a maximum length of 4.
  *
  * @author bmorel at ssi dot fr (modified)
@@ -259,12 +259,12 @@
 	$length = strlen($str);
 	for ($i=0; $i < $length; $i++) {
 		$c = ord($str[$i]);
-		if ($c < 0x80) $n = 0; # 0bbbbbbb
+		if ($c < 0x80) $n = 0;             # 0bbbbbbb
 		elseif (($c & 0xE0) == 0xC0) $n=1; # 110bbbbb
 		elseif (($c & 0xF0) == 0xE0) $n=2; # 1110bbbb
 		elseif (($c & 0xF8) == 0xF0) $n=3; # 11110bbb
-		elseif (($c & 0xFC) == 0xF8) $n=4; # 111110bb
-		elseif (($c & 0xFE) == 0xFC) $n=5; # 1111110b
+		elseif (($c & 0xFC) == 0xF8) $n=4; # 111110bb // invalid UTF-8, in here 
+		elseif (($c & 0xFE) == 0xFC) $n=5; # 1111110b // for backcompat reasons 
 		else return false; # Does not match any model
 		for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
 			if ((++$i == $length) || ((ord($str[$i]) & 0xC0) != 0x80))
@@ -275,6 +275,170 @@
 }
 
 /**
+ * Tells whether a string is UTF-8 encoded, using iconv when it is available.
+ *
+ * @author hakre
+ * @since  3.0
+ *
+ * @param  string $str The string to be checked
+ * @return bool   true if $str is UTF-8 encoded, false otherwise.
+ */
+function is_valid_utf8( &$str ) {
+	// Without the iconv extension, fall back to the regular expression check.
+	if ( ! function_exists( 'iconv' ) )
+		return is_valid_utf8_preg( $str );
+	return is_valid_utf8_iconv( $str );
+}
+
+/**
+ * Checks whether a string is utf8 encoded by round-tripping it through iconv.
+ *
+ * see: Cal Henderson: Building Scalable Web Sites (p. 96), O'Reilly 2006
+ *
+ * @author hakre
+ * @since  3.0
+ *
+ * @param  string $str The string to be checked
+ * @return bool   true if $str is UTF-8 encoded, false otherwise.
+ */
+function is_valid_utf8_iconv( &$str ) {
+	// iconv() raises a notice and returns false on illegal input; keep the notice out of the page.
+	$out = @iconv( 'UTF-8', 'UTF-8', (string) $str );
+	return $out === (string) $str; // strict, so a false result can never equal '' or '0'
+}
+
+/**
+ * Checks to see if a string is utf8 encoded, using a byte-wise regular expression.
+ *
+ * see: Cal Henderson: Building Scaleable Web Sites (p. 94, 95), O'Reilly 2006
+ *
+ * @author hakre
+ * @since  3.0
+ * @link   http://codex.wordpress.org/User:Hakre/UTF8
+ *
+ * @param  string $str The string to be checked
+ * @return bool   true if $str is UTF-8 encoded, false otherwise.
+ */
+function is_valid_utf8_preg( &$str ) {
+	// Each alternative describes one way a byte sequence can violate RFC 3629.
+	$invalidchars =
+		// bytes that never occur in UTF-8 (this also rules out 5- and 6-byte forms)
+		'[\xC0\xC1\xF5-\xFF]' .
+		// overlong 3-/4-byte forms, UTF-16 surrogates, code points above U+10FFFF
+		'|\xE0[\x80-\x9F]|\xF0[\x80-\x8F]|\xED[\xA0-\xBF]|\xF4[\x90-\xBF]' .
+		// lead byte followed by too few continuation bytes
+		'|[\xC2-\xDF]([^\x80-\xBF]|$)' .
+		'|[\xE0-\xEF].{0,1}([^\x80-\xBF]|$)' .
+		'|[\xF0-\xF4].{0,2}([^\x80-\xBF]|$)' .
+		// continuation byte where no further one is expected
+		'|[\x00-\x7F][\x80-\xBF]' .
+		'|[\xC2-\xDF].[\x80-\xBF]' .
+		'|[\xE0-\xEF]..[\x80-\xBF]' .
+		'|[\xF0-\xF4]...[\x80-\xBF]' .
+		'|^[\x80-\xBF]';
+
+	return 0 === preg_match( "!$invalidchars!", $str ); // a PCRE failure counts as invalid
+}
+
+/**
+ * Checks to see if a string is utf8 encoded.
+ *
+ * Stricter than plain UTF-8: ASCII control characters other than tab, LF
+ * and CR, as well as U+FFFE and U+FFFF, are treated as invalid, too.
+ *
+ * see: code by schiller in #5998
+ *
+ * @author hakre
+ * @since  3.0
+ * @link   http://core.trac.wordpress.org/ticket/5998
+ * @link   http://codex.wordpress.org/User:Hakre/UTF8
+ *
+ * @param  string $str The string to be checked
+ * @return bool   true if $str is UTF-8 encoded, false otherwise.
+ */
+function is_valid_utf8_preg5998( &$str ) {
+	// Alternation matching exactly one well-formed character, byte by byte.
+	$validchars = '/' .
+		'[\x09\x0A\x0D\x20-\x7E]' .        // ASCII (tab, LF, CR, printable)
+		'|[\xC2-\xDF][\x80-\xBF]' .        // non-overlong 2-byte
+		'|\xE0[\xA0-\xBF][\x80-\xBF]' .    // 3-byte, excluding overlongs
+		'|[\xE1-\xEC\xEE][\x80-\xBF]{2}' . // straight 3-byte
+		'|\xEF[\x80-\xBE][\x80-\xBF]' .    // 3-byte, but exclude
+		'|\xEF\xBF[\x80-\xBD]' .           // U+FFFE and U+FFFF
+		'|\xED[\x80-\x9F][\x80-\xBF]' .    // 3-byte, excluding surrogates
+		'|\xF0[\x90-\xBF][\x80-\xBF]{2}' . // planes 1-3
+		'|[\xF1-\xF3][\x80-\xBF]{3}' .     // planes 4-15
+		'|\xF4[\x80-\x8F][\x80-\xBF]{2}' . // plane 16
+		'/';
+
+	// Strip every valid character; anything left over is an invalid byte.
+	$result = preg_replace( $validchars, '', $str );
+
+	// preg_replace() yields NULL on error, which must not pass as valid.
+	return ( null !== $result && strlen( $result ) == 0 );
+}
+
+/**
+ * Checks to see if a string is utf8 encoded.
+ *
+ * NOTE: Unlike seems_utf8(), this function follows RFC 3629: no overlong
+ *       forms, no surrogates, nothing above U+10FFFF, 4 bytes at most.
+ *
+ * @author hakre
+ * @since  3.0
+ * @param  string $str The string to be checked
+ * @return bool   true if $str is UTF-8 encoded, false otherwise.
+ */
+function is_valid_utf8_statemachine( &$str ) {
+	for ($i=0, $length=strlen($str); $i < $length; $i++) {
+		$c = ord($str[$i]);
+		if ($c < 0x80) continue;                  # 0bbbbbbb
+		elseif ($c >= 0xC2 && $c <= 0xDF) $n=1;   # 110bbbbb (C0 and C1 would be overlong)
+		elseif (($c & 0xF0) == 0xE0) $n=2;        # 1110bbbb
+		elseif ($c >= 0xF0 && $c <= 0xF4) $n=3;   # 11110bbb (F5 and up exceed U+10FFFF)
+		else return false;                        # not a valid lead byte
+		if ($i + $n >= $length) return false;     # sequence is cut off
+		$b = ord($str[$i+1]);                     # 2nd byte decides: overlong, surrogate, too large
+		if (($c == 0xE0 && $b < 0xA0) || ($c == 0xED && $b > 0x9F)) return false;
+		if (($c == 0xF0 && $b < 0x90) || ($c == 0xF4 && $b > 0x8F)) return false;
+		while ($n-- > 0)                          # n bytes matching 10bbbbbb follow ?
+			if ((ord($str[++$i]) & 0xC0) != 0x80) return false;
+	}
+	return true;
+}
+
+/**
+ * check string for being xml well-formed
+ *
+ * see: code by dwright in #5998
+ *
+ * @author hakre
+ * @since  3.0
+ *
+ * @param  string $str text to be checked; it is wrapped into a root element first
+ * @param  string $err xml-parser error message (on failure), empty on success
+ * @return bool   true if xml is well-formed, false if not.
+ */
+function is_wellformed_xml( &$str, &$err ) {
+	$err = ''; // never hand a stale message from an earlier call back to the caller
+	if ( ! ( $parser = xml_parser_create( 'UTF-8' ) ) ) {
+		$err = 'XML error: unable to create parser.';
+		return false;
+	}
+
+	// A fragment needs a single root element to be parseable as a document.
+	$result = xml_parse( $parser, "<pre>$str</pre>", true );
+	if ( ! $result ) {
+		$err = sprintf( 'XML error: %s at line %d column %d',
+			xml_error_string( xml_get_error_code( $parser ) ),
+			xml_get_current_line_number( $parser ),
+			xml_get_current_column_number( $parser ) );
+	}
+	xml_parser_free( $parser );
+	return (bool) $result;
+}
+
+/**
  * Converts a number of special characters into their HTML entities.
  *
  * Specifically deals with: &, <, >, ", and '.
