Changeset 58836
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/html-api/class-wp-html-processor-state.php
r58779 r58836 430 430 431 431 /** 432 * The recognized encoding of the input byte stream. 433 * 434 * > The stream of code points that comprises the input to the tokenization 435 * > stage will be initially seen by the user agent as a stream of bytes 436 * > (typically coming over the network or from the local file system). 437 * > The bytes encode the actual characters according to a particular character 438 * > encoding, which the user agent uses to decode the bytes into characters. 439 * 440 * @since 6.7.0 441 * 442 * @var string|null 443 */ 444 public $encoding = null; 445 446 /** 447 * The parser's confidence in the input encoding. 448 * 449 * > When the HTML parser is decoding an input byte stream, it uses a character 450 * > encoding and a confidence. The confidence is either tentative, certain, or 451 * > irrelevant. The encoding used, and whether the confidence in that encoding 452 * > is tentative or certain, is used during the parsing to determine whether to 453 * > change the encoding. If no encoding is necessary, e.g. because the parser is 454 * > operating on a Unicode stream and doesn't have to use a character encoding 455 * > at all, then the confidence is irrelevant. 456 * 457 * @since 6.7.0 458 * 459 * @var string 460 */ 461 public $encoding_confidence = 'tentative'; 462 463 /** 432 464 * HEAD element pointer. 433 465 * -
trunk/src/wp-includes/html-api/class-wp-html-processor.php
r58833 r58836 257 257 private $context_node = null; 258 258 259 /**260 * Whether the parser has yet processed the context node,261 * if created as a fragment parser.262 *263 * The context node will be initially pushed onto the stack of open elements,264 * but when created as a fragment parser, this context element (and the implicit265 * HTML document node above it) should not be exposed as a matched token or node.266 *267 * This boolean indicates whether the processor should skip over the current268 * node in its initial search for the first node created from the input HTML.269 *270 * @var bool271 */272 private $has_seen_context_node = false;273 274 259 /* 275 260 * Public Interface Functions … … 313 298 } 314 299 315 $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); 316 $processor->state->context_node = array( 'BODY', array() ); 317 $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; 300 $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); 301 $processor->state->context_node = array( 'BODY', array() ); 302 $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; 303 $processor->state->encoding = $encoding; 304 $processor->state->encoding_confidence = 'certain'; 318 305 319 306 // @todo Create "fake" bookmarks for non-existent but implied nodes. … … 337 324 $processor->context_node = $context_node; 338 325 $processor->breadcrumbs = array( 'HTML', $context_node->node_name ); 326 327 return $processor; 328 } 329 330 /** 331 * Creates an HTML processor in the full parsing mode. 332 * 333 * It's likely that a fragment parser is more appropriate, unless sending an 334 * entire HTML document from start to finish. Consider a fragment parser with 335 * a context node of `<body>`. 
336 * 337 * Since UTF-8 is the only currently-accepted charset, if working with a 338 * document that isn't UTF-8, it's important to convert the document before 339 * creating the processor: pass in the converted HTML. 340 * 341 * @param string $html Input HTML document to process. 342 * @param string|null $known_definite_encoding Optional. If provided, specifies the charset used 343 * in the input byte stream. Currently must be UTF-8. 344 * @return static|null The created processor if successful, otherwise null. 345 */ 346 public static function create_full_parser( $html, $known_definite_encoding = 'UTF-8' ) { 347 if ( 'UTF-8' !== $known_definite_encoding ) { 348 return null; 349 } 350 351 $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); 352 $processor->state->encoding = $known_definite_encoding; 353 $processor->state->encoding_confidence = 'certain'; 339 354 340 355 return $processor; … … 994 1009 */ 995 1010 private function step_initial(): bool { 996 $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_INITIAL . ' state.' ); 1011 $token_name = $this->get_token_name(); 1012 $token_type = $this->get_token_type(); 1013 $op_sigil = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : ''; 1014 $op = "{$op_sigil}{$token_name}"; 1015 1016 switch ( $op ) { 1017 /* 1018 * > A character token that is one of U+0009 CHARACTER TABULATION, 1019 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), 1020 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE 1021 * 1022 * Parse error: ignore the token. 
1023 */ 1024 case '#text': 1025 $text = $this->get_modifiable_text(); 1026 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1027 return $this->step(); 1028 } 1029 goto initial_anything_else; 1030 break; 1031 1032 /* 1033 * > A comment token 1034 */ 1035 case '#comment': 1036 case '#funky-comment': 1037 case '#presumptuous-tag': 1038 $this->insert_html_element( $this->state->current_token ); 1039 return true; 1040 1041 /* 1042 * > A DOCTYPE token 1043 */ 1044 case 'html': 1045 $contents = $this->get_modifiable_text(); 1046 if ( ' html' !== $contents ) { 1047 /* 1048 * @todo When the HTML Tag Processor fully parses the DOCTYPE declaration, 1049 * this code should examine the contents to set the compatibility mode. 1050 */ 1051 $this->bail( 'Cannot process any DOCTYPE other than a normative HTML5 doctype.' ); 1052 } 1053 1054 /* 1055 * > Then, switch the insertion mode to "before html". 1056 */ 1057 $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML; 1058 return true; 1059 } 1060 1061 /* 1062 * > Anything else 1063 */ 1064 initial_anything_else: 1065 $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML; 1066 return $this->step( self::REPROCESS_CURRENT_NODE ); 997 1067 } 998 1068 … … 1003 1073 * logic for the generalized WP_HTML_Processor::step() function. 1004 1074 * 1005 * @since 6.7.0 Stub implementation.1075 * @since 6.7.0 1006 1076 * 1007 1077 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. … … 1013 1083 */ 1014 1084 private function step_before_html(): bool { 1015 $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML . ' state.' ); 1085 $token_name = $this->get_token_name(); 1086 $token_type = $this->get_token_type(); 1087 $is_closer = parent::is_tag_closer(); 1088 $op_sigil = '#tag' === $token_type ? ( $is_closer ? 
'-' : '+' ) : ''; 1089 $op = "{$op_sigil}{$token_name}"; 1090 1091 switch ( $op ) { 1092 /* 1093 * > A DOCTYPE token 1094 */ 1095 case 'html': 1096 // Parse error: ignore the token. 1097 return $this->step(); 1098 1099 /* 1100 * > A comment token 1101 */ 1102 case '#comment': 1103 case '#funky-comment': 1104 case '#presumptuous-tag': 1105 $this->insert_html_element( $this->state->current_token ); 1106 return true; 1107 1108 /* 1109 * > A character token that is one of U+0009 CHARACTER TABULATION, 1110 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), 1111 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE 1112 * 1113 * Parse error: ignore the token. 1114 */ 1115 case '#text': 1116 $text = $this->get_modifiable_text(); 1117 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1118 return $this->step(); 1119 } 1120 goto before_html_anything_else; 1121 break; 1122 1123 /* 1124 * > A start tag whose tag name is "html" 1125 */ 1126 case '+HTML': 1127 $this->insert_html_element( $this->state->current_token ); 1128 $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD; 1129 return true; 1130 1131 /* 1132 * > An end tag whose tag name is one of: "head", "body", "html", "br" 1133 * 1134 * Closing BR tags are always reported by the Tag Processor as opening tags. 1135 */ 1136 case '-HEAD': 1137 case '-BODY': 1138 case '-HTML': 1139 /* 1140 * > Act as described in the "anything else" entry below. 1141 */ 1142 goto before_html_anything_else; 1143 break; 1144 } 1145 1146 /* 1147 * > Any other end tag 1148 */ 1149 if ( $is_closer ) { 1150 // Parse error: ignore the token. 1151 return $this->step(); 1152 } 1153 1154 /* 1155 * > Anything else. 1156 * 1157 * > Create an html element whose node document is the Document object. 1158 * > Append it to the Document object. Put this element in the stack of open elements. 1159 * > Switch the insertion mode to "before head", then reprocess the token. 
1160 */ 1161 before_html_anything_else: 1162 $this->insert_virtual_node( 'HTML' ); 1163 $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD; 1164 return $this->step( self::REPROCESS_CURRENT_NODE ); 1016 1165 } 1017 1166 … … 1032 1181 */ 1033 1182 private function step_before_head(): bool { 1034 $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD . ' state.' ); 1183 $token_name = $this->get_token_name(); 1184 $token_type = $this->get_token_type(); 1185 $is_closer = parent::is_tag_closer(); 1186 $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; 1187 $op = "{$op_sigil}{$token_name}"; 1188 1189 switch ( $op ) { 1190 /* 1191 * > A character token that is one of U+0009 CHARACTER TABULATION, 1192 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), 1193 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE 1194 * 1195 * Parse error: ignore the token. 1196 */ 1197 case '#text': 1198 $text = $this->get_modifiable_text(); 1199 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1200 return $this->step(); 1201 } 1202 goto before_head_anything_else; 1203 break; 1204 1205 /* 1206 * > A comment token 1207 */ 1208 case '#comment': 1209 case '#funky-comment': 1210 case '#presumptuous-tag': 1211 $this->insert_html_element( $this->state->current_token ); 1212 return true; 1213 1214 /* 1215 * > A DOCTYPE token 1216 */ 1217 case 'html': 1218 // Parse error: ignore the token. 
1219 return $this->step(); 1220 1221 /* 1222 * > A start tag whose tag name is "html" 1223 */ 1224 case '+HTML': 1225 return $this->step_in_body(); 1226 1227 /* 1228 * > A start tag whose tag name is "head" 1229 */ 1230 case '+HEAD': 1231 $this->insert_html_element( $this->state->current_token ); 1232 $this->state->head_element = $this->state->current_token; 1233 $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; 1234 return true; 1235 1236 /* 1237 * > An end tag whose tag name is one of: "head", "body", "html", "br" 1238 * > Act as described in the "anything else" entry below. 1239 * 1240 * Closing BR tags are always reported by the Tag Processor as opening tags. 1241 */ 1242 case '-HEAD': 1243 case '-BODY': 1244 case '-HTML': 1245 goto before_head_anything_else; 1246 break; 1247 } 1248 1249 if ( $is_closer ) { 1250 // Parse error: ignore the token. 1251 return $this->step(); 1252 } 1253 1254 /* 1255 * > Anything else 1256 * 1257 * > Insert an HTML element for a "head" start tag token with no attributes. 1258 */ 1259 before_head_anything_else: 1260 $this->state->head_element = $this->insert_virtual_node( 'HEAD' ); 1261 $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; 1262 return $this->step( self::REPROCESS_CURRENT_NODE ); 1035 1263 } 1036 1264 … … 1057 1285 $op = "{$op_sigil}{$token_name}"; 1058 1286 1059 /* 1060 * > A character token that is one of U+0009 CHARACTER TABULATION, 1061 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), 1062 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE 1063 */ 1064 if ( '#text' === $op ) { 1065 $text = $this->get_modifiable_text(); 1066 if ( '' === $text ) { 1287 switch ( $op ) { 1288 case '#text': 1067 1289 /* 1068 * If the text is empty after processing HTML entities and stripping 1069 * U+0000 NULL bytes then ignore the token. 
1290 * > A character token that is one of U+0009 CHARACTER TABULATION, 1291 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), 1292 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE 1070 1293 */ 1071 return $this->step(); 1072 } 1073 1074 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1075 // Insert the character. 1076 $this->insert_html_element( $this->state->current_token ); 1077 return true; 1078 } 1079 } 1080 1081 switch ( $op ) { 1294 $text = $this->get_modifiable_text(); 1295 if ( '' === $text ) { 1296 /* 1297 * If the text is empty after processing HTML entities and stripping 1298 * U+0000 NULL bytes then ignore the token. 1299 */ 1300 return $this->step(); 1301 } 1302 1303 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1304 // Insert the character. 1305 $this->insert_html_element( $this->state->current_token ); 1306 return true; 1307 } 1308 1309 goto in_head_anything_else; 1310 break; 1311 1082 1312 /* 1083 1313 * > A comment token … … 1125 1355 */ 1126 1356 $charset = $this->get_attribute( 'charset' ); 1127 if ( is_string( $charset ) ) {1357 if ( is_string( $charset ) && 'tentative' === $this->state->encoding_confidence ) { 1128 1358 $this->bail( 'Cannot yet process META tags with charset to determine encoding.' ); 1129 1359 } … … 1142 1372 is_string( $http_equiv ) && 1143 1373 is_string( $content ) && 1144 0 === strcasecmp( $http_equiv, 'Content-Type' ) 1374 0 === strcasecmp( $http_equiv, 'Content-Type' ) && 1375 'tentative' === $this->state->encoding_confidence 1145 1376 ) { 1146 1377 $this->bail( 'Cannot yet process META tags with http-equiv Content-Type to determine encoding.' ); … … 1194 1425 /* 1195 1426 * > An end tag whose tag name is one of: "body", "html", "br" 1427 * 1428 * BR tags are always reported by the Tag Processor as opening tags. 1196 1429 */ 1197 1430 case '-BODY': 1198 1431 case '-HTML': 1199 case '-BR':1200 1432 /* 1201 1433 * > Act as described in the "anything else" entry below. 
… … 1274 1506 */ 1275 1507 private function step_in_head_noscript(): bool { 1276 $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT . ' state.' ); 1508 $token_name = $this->get_token_name(); 1509 $token_type = $this->get_token_type(); 1510 $is_closer = parent::is_tag_closer(); 1511 $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; 1512 $op = "{$op_sigil}{$token_name}"; 1513 1514 switch ( $op ) { 1515 /* 1516 * > A character token that is one of U+0009 CHARACTER TABULATION, 1517 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), 1518 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE 1519 * 1520 * Process the token using the rules for the "in head" insertion mode. 1521 */ 1522 case '#text': 1523 $text = $this->get_modifiable_text(); 1524 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1525 return $this->step_in_head(); 1526 } 1527 1528 goto in_head_noscript_anything_else; 1529 break; 1530 1531 /* 1532 * > A DOCTYPE token 1533 */ 1534 case 'html': 1535 // Parse error: ignore the token. 
1536 return $this->step(); 1537 1538 /* 1539 * > A start tag whose tag name is "html" 1540 */ 1541 case '+HTML': 1542 return $this->step_in_body(); 1543 1544 /* 1545 * > An end tag whose tag name is "noscript" 1546 */ 1547 case '-NOSCRIPT': 1548 $this->state->stack_of_open_elements->pop(); 1549 $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; 1550 return true; 1551 1552 /* 1553 * > A comment token 1554 * > 1555 * > A start tag whose tag name is one of: "basefont", "bgsound", 1556 * > "link", "meta", "noframes", "style" 1557 */ 1558 case '#comment': 1559 case '#funky-comment': 1560 case '#presumptuous-tag': 1561 case '+BASEFONT': 1562 case '+BGSOUND': 1563 case '+LINK': 1564 case '+META': 1565 case '+NOFRAMES': 1566 case '+STYLE': 1567 return $this->step_in_head(); 1568 1569 /* 1570 * > An end tag whose tag name is "br" 1571 * 1572 * This should never happen, as the Tag Processor prevents showing a BR closing tag. 1573 */ 1574 } 1575 1576 /* 1577 * > A start tag whose tag name is one of: "head", "noscript" 1578 * > Any other end tag 1579 */ 1580 if ( '+HEAD' === $op || '+NOSCRIPT' === $op || $is_closer ) { 1581 // Parse error: ignore the token. 1582 return $this->step(); 1583 } 1584 1585 /* 1586 * > Anything else 1587 * 1588 * Anything here is a parse error. 1589 */ 1590 in_head_noscript_anything_else: 1591 $this->state->stack_of_open_elements->pop(); 1592 $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; 1593 return $this->step( self::REPROCESS_CURRENT_NODE ); 1277 1594 } 1278 1595 … … 1293 1610 */ 1294 1611 private function step_after_head(): bool { 1295 $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD . ' state.' ); 1612 $token_name = $this->get_token_name(); 1613 $token_type = $this->get_token_type(); 1614 $is_closer = parent::is_tag_closer(); 1615 $op_sigil = '#tag' === $token_type ? ( $is_closer ? 
'-' : '+' ) : ''; 1616 $op = "{$op_sigil}{$token_name}"; 1617 1618 switch ( $op ) { 1619 /* 1620 * > A character token that is one of U+0009 CHARACTER TABULATION, 1621 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), 1622 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE 1623 */ 1624 case '#text': 1625 $text = $this->get_modifiable_text(); 1626 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1627 // Insert the character. 1628 $this->insert_html_element( $this->state->current_token ); 1629 return true; 1630 } 1631 goto after_head_anything_else; 1632 break; 1633 1634 /* 1635 * > A comment token 1636 */ 1637 case '#comment': 1638 case '#funky-comment': 1639 case '#presumptuous-tag': 1640 $this->insert_html_element( $this->state->current_token ); 1641 return true; 1642 1643 /* 1644 * > A DOCTYPE token 1645 */ 1646 case 'html': 1647 // Parse error: ignore the token. 1648 return $this->step(); 1649 1650 /* 1651 * > A start tag whose tag name is "html" 1652 */ 1653 case '+HTML': 1654 return $this->step_in_body(); 1655 1656 /* 1657 * > A start tag whose tag name is "body" 1658 */ 1659 case '+BODY': 1660 $this->insert_html_element( $this->state->current_token ); 1661 $this->state->frameset_ok = false; 1662 $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; 1663 return true; 1664 1665 /* 1666 * > A start tag whose tag name is "frameset" 1667 */ 1668 case '+FRAMESET': 1669 $this->insert_html_element( $this->state->current_token ); 1670 $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET; 1671 return true; 1672 1673 /* 1674 * > A start tag whose tag name is one of: "base", "basefont", "bgsound", 1675 * > "link", "meta", "noframes", "script", "style", "template", "title" 1676 * 1677 * Anything here is a parse error. 
1678 */ 1679 case '+BASE': 1680 case '+BASEFONT': 1681 case '+BGSOUND': 1682 case '+LINK': 1683 case '+META': 1684 case '+NOFRAMES': 1685 case '+SCRIPT': 1686 case '+STYLE': 1687 case '+TEMPLATE': 1688 case '+TITLE': 1689 /* 1690 * > Push the node pointed to by the head element pointer onto the stack of open elements. 1691 * > Process the token using the rules for the "in head" insertion mode. 1692 * > Remove the node pointed to by the head element pointer from the stack of open elements. (It might not be the current node at this point.) 1693 */ 1694 $this->bail( 'Cannot process elements after HEAD which reopen the HEAD element.' ); 1695 /* 1696 * Do not leave this break in when adding support; it's here to prevent 1697 * WPCS from getting confused at the switch structure without a return, 1698 * because it doesn't know that `bail()` always throws. 1699 */ 1700 break; 1701 1702 /* 1703 * > An end tag whose tag name is "template" 1704 */ 1705 case '-TEMPLATE': 1706 return $this->step_in_head(); 1707 1708 /* 1709 * > An end tag whose tag name is one of: "body", "html", "br" 1710 * 1711 * Closing BR tags are always reported by the Tag Processor as opening tags. 1712 */ 1713 case '-BODY': 1714 case '-HTML': 1715 /* 1716 * > Act as described in the "anything else" entry below. 1717 */ 1718 goto after_head_anything_else; 1719 break; 1720 } 1721 1722 /* 1723 * > A start tag whose tag name is "head" 1724 * > Any other end tag 1725 */ 1726 if ( '+HEAD' === $op || $is_closer ) { 1727 // Parse error: ignore the token. 1728 return $this->step(); 1729 } 1730 1731 /* 1732 * > Anything else 1733 * > Insert an HTML element for a "body" start tag token with no attributes. 1734 */ 1735 after_head_anything_else: 1736 $this->insert_virtual_node( 'BODY' ); 1737 $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; 1738 return $this->step( self::REPROCESS_CURRENT_NODE ); 1296 1739 } 1297 1740 … … 4470 4913 * @param string|null $bookmark_name Optional. 
Name to give bookmark for created virtual node. 4471 4914 * Defaults to auto-creating a bookmark name. 4472 */ 4473 private function insert_virtual_node( $token_name, $bookmark_name = null ): void { 4915 * @return WP_HTML_Token Newly-created virtual token. 4916 */ 4917 private function insert_virtual_node( $token_name, $bookmark_name = null ): WP_HTML_Token { 4474 4918 $here = $this->bookmarks[ $this->state->current_token->bookmark_name ]; 4475 4919 $name = $bookmark_name ?? $this->bookmark_token(); … … 4477 4921 $this->bookmarks[ $name ] = new WP_HTML_Span( $here->start, 0 ); 4478 4922 4479 $this->insert_html_element( new WP_HTML_Token( $name, $token_name, false ) ); 4923 $token = new WP_HTML_Token( $name, $token_name, false ); 4924 $this->insert_html_element( $token ); 4925 return $token; 4480 4926 } 4481 4927 … … 4634 5080 } 4635 5081 5082 /** 5083 * Gets an encoding from a given string. 5084 * 5085 * This is an algorithm defined in the WHAT-WG specification. 5086 * 5087 * Example: 5088 * 5089 * 'UTF-8' === self::get_encoding( 'utf8' ); 5090 * 'UTF-8' === self::get_encoding( " \tUTF-8 " ); 5091 * null === self::get_encoding( 'UTF-7' ); 5092 * null === self::get_encoding( 'utf8; charset=' ); 5093 * 5094 * @see https://encoding.spec.whatwg.org/#concept-encoding-get 5095 * 5096 * @todo As this parser only supports UTF-8, only the UTF-8 5097 * encodings are detected. Add more as desired, but the 5098 * parser will bail on non-UTF-8 encodings. 5099 * 5100 * @since 6.7.0 5101 * 5102 * @param string $label A string which may specify a known encoding. 5103 * @return string|null Known encoding if matched, otherwise null. 5104 */ 5105 protected static function get_encoding( string $label ): ?string { 5106 /* 5107 * > Remove any leading and trailing ASCII whitespace from label. 
5108 */ 5109 $label = trim( $label, " \t\f\r\n" ); 5110 5111 /* 5112 * > If label is an ASCII case-insensitive match for any of the labels listed in the 5113 * > table below, then return the corresponding encoding; otherwise return failure. 5114 */ 5115 switch ( strtolower( $label ) ) { 5116 case 'unicode-1-1-utf-8': 5117 case 'unicode11utf8': 5118 case 'unicode20utf8': 5119 case 'utf-8': 5120 case 'utf8': 5121 case 'x-unicode20utf8': 5122 return 'UTF-8'; 5123 5124 default: 5125 return null; 5126 } 5127 } 5128 4636 5129 /* 4637 5130 * Constants that would pollute the top of the class if they were found there. -
trunk/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php
r58833 r58836 26 26 $processor = WP_HTML_Processor::create_fragment( $html ); 27 27 28 $this->assertTrue( $processor-> step(), "Failed to step into supported {$tag_name} element." );28 $this->assertTrue( $processor->next_token(), "Failed to step into supported {$tag_name} element." ); 29 29 $this->assertSame( $tag_name, $processor->get_tag(), "Misread {$tag_name} as a {$processor->get_tag()} element." ); 30 30 } … … 91 91 'INS', 92 92 'LI', 93 'LINK', 93 94 'ISINDEX', // Deprecated. 94 95 'KBD', … … 109 110 'NEXTID', // Deprecated. 110 111 'NOBR', // Neutralized. 112 'NOEMBED', // Neutralized. 113 'NOFRAMES', // Neutralized. 111 114 'NOSCRIPT', 112 115 'OBJECT', … … 123 126 'RUBY', 124 127 'SAMP', 128 'SCRIPT', 125 129 'SEARCH', 126 130 'SECTION', … … 131 135 'STRIKE', 132 136 'STRONG', 137 'STYLE', 133 138 'SUB', 134 139 'SUMMARY', 135 140 'SUP', 136 141 'TABLE', 142 'TEXTAREA', 137 143 'TIME', 144 'TITLE', 138 145 'TT', 139 146 'U', … … 141 148 'VAR', 142 149 'VIDEO', 150 'XMP', // Deprecated, use PRE instead. 143 151 ); 144 152 145 153 $data = array(); 146 154 foreach ( $supported_elements as $tag_name ) { 147 $data[ $tag_name ] = array( "<{$tag_name}>", $tag_name ); 155 $closer = in_array( $tag_name, array( 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) 156 ? "</{$tag_name}>" 157 : ''; 158 159 $data[ $tag_name ] = array( "<{$tag_name}>{$closer}", $tag_name ); 148 160 } 149 161 … … 183 195 public static function data_unsupported_elements() { 184 196 $unsupported_elements = array( 185 'BODY',186 'FRAME',187 'FRAMESET',188 'HEAD',189 'HTML',190 'IFRAME',191 197 'MATH', 192 'NOEMBED', // Neutralized.193 'NOFRAMES', // Neutralized.194 198 'PLAINTEXT', // Neutralized. 195 'SCRIPT',196 'STYLE',197 199 'SVG', 198 'TEXTAREA',199 'TITLE',200 'XMP', // Deprecated, use PRE instead.201 200 ); 202 201
Note: See TracChangeset
for help on using the changeset viewer.