| | 278 | * is a string utf8 encoded? |
| | 279 | * |
| | 280 | * @author hakre |
| | 281 | * @since 3.0 |
| | 282 | * |
| | 283 | * @param string $str The string to be checked |
| | 284 | * @return bool true if $str is UTF-8 encoded, false otherwise. |
| | 285 | */ |
function is_valid_utf8( &$str ) {
	// Without an iconv() function available, fall back to the PCRE-based check.
	if ( ! function_exists( 'iconv' ) ) {
		return is_valid_utf8_preg( $str );
	}

	// Otherwise delegate to the iconv()-based check.
	return is_valid_utf8_iconv( $str );
}
| | 292 | |
/**
 * Checks whether a string is valid UTF-8 by round-tripping it through iconv().
 *
 * The string is converted from UTF-8 to UTF-8. It is reported as valid only
 * when the conversion returns exactly the same bytes that were passed in.
 *
 * see: Cal Henderson: Building Scaleable Web Sites (p. 96), O'Reilly 2006
 *
 * @author hakre
 * @since 3.0
 *
 * @param string $str The string to be checked
 * @return bool true if $str is UTF-8 encoded, false otherwise.
 */
function is_valid_utf8_iconv( &$str ) {
	// iconv() raises an E_NOTICE when it detects an illegal or incomplete
	// sequence. That is the expected outcome for invalid input here, not an
	// error condition, so the notice is suppressed for this one call.
	$out = @iconv( 'UTF-8', 'UTF-8', $str );

	// Strict comparison: a failed conversion (false) or a truncated result
	// must never compare equal to the input through type juggling. The cast
	// keeps scalar non-string arguments behaving as they did before.
	return $out === (string) $str;
}
| | 309 | |
/**
 * Checks whether a string is valid UTF-8 using the PCRE extension.
 *
 * The check relies on the `u` pattern modifier: with it, PCRE validates the
 * subject as UTF-8 before matching and preg_match() fails on an invalid
 * subject. The pattern itself is empty, so it matches any valid subject
 * (including the empty string) at offset 0.
 *
 * This replaces an earlier hand-written "invalid sequence" pattern
 * (after Cal Henderson: Building Scaleable Web Sites (p. 94, 95),
 * O'Reilly 2006) that had no branch for the byte 0xFF and accepted overlong
 * lead bytes as well as 5- and 6-byte forms.
 *
 * @author hakre
 * @since 3.0
 * @link http://codex.wordpress.org/User:Hakre/UTF8
 *
 * @param string $str The string to be checked
 * @return bool true if $str is UTF-8 encoded, false otherwise.
 */
function is_valid_utf8_preg( &$str ) {
	// preg_match() returns 1 on a match, 0 on no match and false on failure;
	// an invalid UTF-8 subject is reported as a failure. Only an actual
	// match (1) means the whole subject passed the UTF-8 check.
	return 1 === preg_match( '//u', $str );
}
| | 342 | |
/**
 * Checks whether a string consists solely of well-formed UTF-8 text.
 *
 * Every well-formed sequence is removed from the string; the string is
 * valid when nothing is left over. Note that the accepted set is narrower
 * than "any valid UTF-8": among the single-byte characters only tab, line
 * feed, carriage return and the printable range 0x20-0x7E are accepted, and
 * the non-characters U+FFFE and U+FFFF are rejected.
 *
 * @author hakre (based on code by schiller in #5998)
 * @since 3.0
 * @link http://core.trac.wordpress.org/ticket/5998
 * @link http://codex.wordpress.org/User:Hakre/UTF8
 *
 * @param string $str The string to be checked
 * @return bool true if $str is UTF-8 encoded, false otherwise.
 */
function is_valid_utf8_preg5998( &$str ) {

	$validchars = '/' .
		'[\x09\x0A\x0D\x20-\x7E]' .           // ASCII (tab, LF, CR, printable)
		'|[\xC2-\xDF][\x80-\xBF]' .           // non-overlong 2-byte
		'|\xE0[\xA0-\xBF][\x80-\xBF]' .       // excluding overlongs
		'|[\xE1-\xEC\xEE][\x80-\xBF]{2}' .    // straight 3-byte
		'|\xEF[\x80-\xBE][\x80-\xBF]' .       // 3-byte, but exclude U+FFFE and U+FFFF
		'|\xEF\xBF[\x80-\xBD]' .
		'|\xED[\x80-\x9F][\x80-\xBF]' .       // excluding surrogates
		'|\xF0[\x90-\xBF][\x80-\xBF]{2}' .    // planes 1-3
		'|[\xF1-\xF3][\x80-\xBF]{3}' .        // planes 4-15
		'|\xF4[\x80-\x8F][\x80-\xBF]{2}' .    // plane 16
		'/';

	// Strip everything that is well-formed. preg_replace() returns NULL on
	// error, which must count as "not valid" rather than as an empty rest.
	$result = preg_replace( $validchars, '', $str );

	return '' === $result;
}
| | 378 | |
/**
 * Checks whether a string is valid UTF-8 by walking it byte by byte.
 *
 * Each lead byte determines how many continuation bytes (10bbbbbb) must
 * follow. In addition, the first continuation byte is range-checked for the
 * lead bytes E0, ED, F0 and F4 so that overlong encodings, UTF-16 surrogates
 * (U+D800..U+DFFF) and code points above U+10FFFF are rejected. Lead bytes
 * C0, C1 and F5..FF can never occur in valid UTF-8 and are rejected outright.
 *
 * NOTE: This function is meant to conform with the UTF-8 standard
 * (RFC 3629); per the original note, seems_utf8() does not.
 *
 * @author hakre
 * @since 3.0
 *
 * @param string $str The string to be checked
 * @return bool true if $str is UTF-8 encoded, false otherwise.
 */
function is_valid_utf8_statemachine( &$str ) {
	$length = strlen( $str );

	for ( $i = 0; $i < $length; $i++ ) {
		$c = ord( $str[$i] );

		if ( $c < 0x80 ) {
			continue; // 0bbbbbbb: single-byte character
		}

		// Allowed range for the FIRST continuation byte; narrowed below for
		// the lead bytes that would otherwise admit forbidden code points.
		$min = 0x80;
		$max = 0xBF;

		if ( $c >= 0xC2 && $c <= 0xDF ) {
			$n = 1;                 // 110bbbbb (C0/C1 would be overlong)
		} elseif ( 0xE0 == $c ) {
			$n   = 2;               // 1110bbbb, exclude overlongs
			$min = 0xA0;
		} elseif ( 0xED == $c ) {
			$n   = 2;               // 1110bbbb, exclude surrogates
			$max = 0x9F;
		} elseif ( $c >= 0xE1 && $c <= 0xEF ) {
			$n = 2;                 // 1110bbbb
		} elseif ( 0xF0 == $c ) {
			$n   = 3;               // 11110bbb, exclude overlongs
			$min = 0x90;
		} elseif ( $c >= 0xF1 && $c <= 0xF3 ) {
			$n = 3;                 // 11110bbb
		} elseif ( 0xF4 == $c ) {
			$n   = 3;               // 11110bbb, nothing above U+10FFFF
			$max = 0x8F;
		} else {
			return false;           // stray continuation byte or invalid lead
		}

		// $n continuation bytes must follow.
		for ( $j = 0; $j < $n; $j++ ) {
			if ( ++$i == $length ) {
				return false;       // sequence is cut off by the end of the string
			}

			$b = ord( $str[$i] );
			if ( $b < $min || $b > $max ) {
				return false;
			}

			// Only the first continuation byte is restricted further.
			$min = 0x80;
			$max = 0xBF;
		}
	}

	return true;
}
| | 408 | |
| | 409 | /** |