| 278 | * is a string utf8 encoded? |
| 279 | * |
| 280 | * @author hakre |
| 281 | * @since 3.0 |
| 282 | * |
| 283 | * @param string $str The string to be checked |
| 284 | * @return bool true if $str is UTF-8 encoded, false otherwise. |
| 285 | */ |
function is_valid_utf8( &$str ) {
	// Without the iconv extension the regular expression based check is the
	// only option; otherwise the iconv based check is used.
	if ( ! function_exists( 'iconv' ) ) {
		return is_valid_utf8_preg( $str );
	}

	return is_valid_utf8_iconv( $str );
}
| 292 | |
/**
 * Checks whether a string is valid UTF-8 by round-tripping it through iconv.
 *
 * The string is converted from UTF-8 to UTF-8. It is reported as valid only
 * when iconv() hands back exactly the same bytes; a failed conversion
 * (false) or an altered/truncated result means the input was not valid.
 *
 * see: Cal Henderson: Building Scaleable Web Sites (p. 96), O'Reilly 2006
 *
 * @author hakre
 * @since 3.0
 *
 * @param string $str The string to be checked (taken by reference, never modified)
 * @return bool true if $str is UTF-8 encoded, false otherwise.
 */
function is_valid_utf8_iconv( &$str ) {
	// Work on a string copy: the caller's variable must stay untouched and
	// the strict comparison below has to compare string with string.
	$input = (string) $str;

	// iconv() reports illegal or incomplete sequences with an E_NOTICE.
	// Invalid input is an expected outcome for a validator, so the notice
	// is silenced here instead of leaking into logs or output.
	$out = @iconv( 'UTF-8', 'UTF-8', $input );

	// Strict comparison: a false return value must never equal the input.
	return $out === $input;
}
| 309 | |
/**
 * Checks whether a string is valid UTF-8 using PCRE's own UTF-8 validation.
 *
 * With the "u" modifier PCRE validates the whole subject as UTF-8 before it
 * attempts a match. The empty pattern matches any valid subject, so
 * preg_match() yields 1 for valid input and false for an invalid subject.
 *
 * This replaces a hand-written "search for an invalid sequence" expression
 * (after Cal Henderson: Building Scaleable Web Sites, p. 94, 95,
 * O'Reilly 2006). That expression never flagged a stray 0xFF byte, accepted
 * overlong forms such as C0 80 and the obsolete 5 and 6 byte sequences, and
 * treated a PCRE failure as "valid".
 *
 * @author hakre
 * @since 3.0
 * @link http://codex.wordpress.org/User:Hakre/UTF8
 *
 * @param string $str The string to be checked
 * @return bool true if $str is UTF-8 encoded, false otherwise.
 */
function is_valid_utf8_preg( &$str ) {
	// Only an explicit match (1) counts as valid; false (invalid subject or
	// any other PCRE error) is reported as "not valid".
	return 1 === preg_match( '//u', $str );
}
| 342 | |
/**
 * Checks whether a string consists only of an accepted set of UTF-8 sequences.
 *
 * Every accepted sequence is stripped from the string; the string passes
 * only if nothing is left over. The accepted set is narrower than plain
 * UTF-8: of the ASCII range only tab, LF, CR and 0x20-0x7E are accepted,
 * and U+FFFE / U+FFFF are rejected, as are overlong forms, surrogates and
 * anything above plane 16.
 *
 * see: code by schiller in #5998
 *
 * @author hakre
 * @since 3.0
 * @link http://core.trac.wordpress.org/ticket/5998
 * @link http://codex.wordpress.org/User:Hakre/UTF8
 *
 * @param string $str The string to be checked
 * @return bool true if $str only contains accepted sequences, false otherwise
 *              (including when the regular expression engine reports an error).
 */
function is_valid_utf8_preg5998( &$str ) {

	$validchars = '/' .
		'[\x09\x0A\x0D\x20-\x7E]+' .            // ASCII: tab, LF, CR, printable (whole runs at once)
		'|[\xC2-\xDF][\x80-\xBF]' .             // non-overlong 2-byte
		'|\xE0[\xA0-\xBF][\x80-\xBF]' .         // 3-byte, excluding overlongs
		'|[\xE1-\xEC\xEE][\x80-\xBF]{2}' .      // straight 3-byte
		'|\xEF[\x80-\xBE][\x80-\xBF]' .         // 3-byte EF xx, below EF BF
		'|\xEF\xBF[\x80-\xBD]' .                // EF BF xx, excluding U+FFFE and U+FFFF
		'|\xED[\x80-\x9F][\x80-\xBF]' .         // 3-byte, excluding surrogates
		'|\xF0[\x90-\xBF][\x80-\xBF]{2}' .      // planes 1-3
		'|[\xF1-\xF3][\x80-\xBF]{3}' .          // planes 4-15
		'|\xF4[\x80-\x8F][\x80-\xBF]{2}' .      // plane 16
		'/';

	// Remove everything acceptable; whatever survives is an offending byte.
	$result = preg_replace( $validchars, '', $str );

	// NULL signals a PCRE error, in which case validity cannot be confirmed.
	return null !== $result && '' === $result;
}
| 380 | |
/**
 * Checks whether a string is valid UTF-8 by walking it byte by byte.
 *
 * Each lead byte determines how many continuation bytes (10bbbbbb) must
 * follow. In addition the first continuation byte is range-checked per lead
 * byte so that overlong encodings, UTF-16 surrogates (U+D800-U+DFFF) and
 * code points above U+10FFFF are rejected, as RFC 3629 requires.
 *
 * @author hakre
 * @since 3.0
 *
 * @param string $str The string to be checked
 * @return bool true if $str is UTF-8 encoded, false otherwise.
 */
function is_valid_utf8_statemachine( &$str ) {
	$length = strlen( $str );

	for ( $i = 0; $i < $length; $i++ ) {
		$c = ord( $str[ $i ] );

		if ( $c < 0x80 ) {
			continue; // 0bbbbbbb: plain ASCII, nothing follows
		}

		// $n: number of continuation bytes; $lo/$hi: allowed range of the
		// FIRST continuation byte (all later ones are 0x80-0xBF).
		$lo = 0x80;
		$hi = 0xBF;

		if ( $c >= 0xC2 && $c <= 0xDF ) {
			$n = 1;                // 110bbbbb (C0/C1 would be overlong)
		} elseif ( 0xE0 == $c ) {
			$n  = 2;
			$lo = 0xA0;            // below A0 would be overlong
		} elseif ( 0xED == $c ) {
			$n  = 2;
			$hi = 0x9F;            // A0-BF would encode surrogates
		} elseif ( $c >= 0xE1 && $c <= 0xEF ) {
			$n = 2;                // 1110bbbb
		} elseif ( 0xF0 == $c ) {
			$n  = 3;
			$lo = 0x90;            // below 90 would be overlong
		} elseif ( $c >= 0xF1 && $c <= 0xF3 ) {
			$n = 3;                // 11110bbb
		} elseif ( 0xF4 == $c ) {
			$n  = 3;
			$hi = 0x8F;            // above 8F would exceed U+10FFFF
		} else {
			return false;          // stray continuation byte or illegal lead byte
		}

		for ( $j = 0; $j < $n; $j++ ) {
			if ( ++$i == $length ) {
				return false;      // sequence is cut off by the end of the string
			}

			$b = ord( $str[ $i ] );
			if ( $b < $lo || $b > $hi ) {
				return false;
			}

			// Only the first continuation byte has a narrowed range.
			$lo = 0x80;
			$hi = 0xBF;
		}
	}

	return true;
}
| 409 | |
/**
 * Checks whether a piece of text parses as well-formed XML.
 *
 * The text is wrapped in a <pre> element (presumably so that fragments
 * without a single root element can be parsed) and fed to a UTF-8 XML
 * parser in one go.
 *
 * see: code by dwright in #5998
 *
 * @author hakre
 * @since 3.0
 *
 * @param string $str text to be checked
 * @param string $err receives the xml-parser error message on failure;
 *                    left untouched when parsing succeeds
 * @return bool true if xml is well-formed, false if not.
 */
function is_wellformed_xml( &$str, &$err ) {
	$parser = xml_parser_create( 'UTF-8' );

	// No parser, no verdict: report the problem and bail out early.
	if ( ! $parser ) {
		$err = 'XML error: unable to create parser.';
		return false;
	}

	// Third argument "true": the wrapped text is the complete document.
	$parsed = xml_parse( $parser, "<pre>$str</pre>", true );

	if ( ! $parsed ) {
		// Collect the details before the parser is released below.
		$message = xml_error_string( xml_get_error_code( $parser ) );
		$line    = xml_get_current_line_number( $parser );
		$column  = xml_get_current_column_number( $parser );

		$err = sprintf( 'XML error: %s at line %d column %d', $message, $line, $column );
	}

	xml_parser_free( $parser );

	return (bool) $parsed;
}
| 440 | |
| 441 | /** |