Changeset 57211
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php
r57179 r57211 16 16 * This would increase the size of the changes for some operations but leave more 17 17 * natural-looking output HTML. 18 * - Decode HTML character references within class names when matching. E.g. match having19 * class `1<"2` needs to recognize `class="1<"2"`. Currently the Tag Processor20 * will fail to find the right tag if the class name is encoded as such.21 18 * - Properly decode HTML character references in `get_attribute()`. PHP's 22 19 * `html_entity_decode()` is wrong in a couple ways: it doesn't account for the … … 107 104 * given, it will return `true` (the only way to set `false` for an 108 105 * attribute is to remove it). 106 * 107 * #### When matching fails 108 * 109 * When `next_tag()` returns `false` it could mean different things: 110 * 111 * - The requested tag wasn't found in the input document. 112 * - The input document ended in the middle of an HTML syntax element. 113 * 114 * When a document ends in the middle of a syntax element it will pause 115 * the processor. This is to make it possible in the future to extend the 116 * input document and proceed - an important requirement for chunked 117 * streaming parsing of a document. 118 * 119 * Example: 120 * 121 * $processor = new WP_HTML_Tag_Processor( 'This <div is="a" partial="token' ); 122 * false === $processor->next_tag(); 123 * 124 * If a special element (see next section) is encountered but no closing tag 125 * is found it will count as an incomplete tag. The parser will pause as if 126 * the opening tag were incomplete. 
127 * 128 * Example: 129 * 130 * $processor = new WP_HTML_Tag_Processor( '<style>// there could be more styling to come' ); 131 * false === $processor->next_tag(); 132 * 133 * $processor = new WP_HTML_Tag_Processor( '<style>// this is everything</style><div>' ); 134 * true === $processor->next_tag( 'DIV' ); 135 * 136 * #### Special elements 137 * 138 * Some HTML elements are handled in a special way; their start and end tags 139 * act like a void tag. These are special because their contents can't contain 140 * HTML markup. Everything inside these elements is handled in a special way 141 * and content that _appears_ like HTML tags inside of them isn't. There can 142 * be no nesting in these elements. 143 * 144 * In the following list, "raw text" means that all of the content in the HTML 145 * until the matching closing tag is treated verbatim without any replacements 146 * and without any parsing. 147 * 148 * - IFRAME allows no content but requires a closing tag. 149 * - NOEMBED (deprecated) content is raw text. 150 * - NOFRAMES (deprecated) content is raw text. 151 * - SCRIPT content is plaintext apart from legacy rules allowing `</script>` inside an HTML comment. 152 * - STYLE content is raw text. 153 * - TITLE content is plain text but character references are decoded. 154 * - TEXTAREA content is plain text but character references are decoded. 155 * - XMP (deprecated) content is raw text. 109 156 * 110 157 * ### Modifying HTML attributes for a found tag … … 242 289 * unquoted values will appear in the output with double-quotes. 243 290 * 291 * ### Scripting Flag 292 * 293 * The Tag Processor parses HTML with the "scripting flag" disabled. This means 294 * that it doesn't run any scripts while parsing the page. In a browser with 295 * JavaScript enabled, for example, the script can change the parse of the 296 * document as it loads. On the server, however, evaluating JavaScript is not 297 * only impractical, but also unwanted. 
298 * 299 * Practically this means that the Tag Processor will descend into NOSCRIPT 300 * elements and process their child tags. Were the scripting flag enabled, such 301 * as in a typical browser, the contents of NOSCRIPT are skipped entirely. 302 * 303 * This allows the HTML API to process the content that will be presented in 304 * a browser when scripting is disabled, but it offers a different view of a 305 * page than most browser sessions will experience. E.g. the tags inside the 306 * NOSCRIPT disappear. 307 * 308 * ### Text Encoding 309 * 310 * The Tag Processor assumes that the input HTML document is encoded with a 311 * text encoding compatible with 7-bit ASCII's '<', '>', '&', ';', '/', '=', 312 * "'", '"', 'a' - 'z', 'A' - 'Z', and the whitespace characters ' ', tab, 313 * carriage-return, newline, and form-feed. 314 * 315 * In practice, this includes almost every single-byte encoding as well as 316 * UTF-8. Notably, however, it does not include UTF-16. If providing input 317 * that's incompatible, then convert the encoding beforehand. 318 * 244 319 * @since 6.2.0 245 320 * @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive. 246 321 * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE. 322 * @since 6.5.0 Pauses processor when input ends in an incomplete syntax token. 323 * Introduces "special" elements which act like void elements, e.g. STYLE. 247 324 */ 248 325 class WP_HTML_Tag_Processor { … … 316 393 */ 317 394 private $stop_on_tag_closers; 395 396 /** 397 * Specifies mode of operation of the parser at any given time. 398 * 399 * | State | Meaning | 400 * | --------------|----------------------------------------------------------------------| 401 * | *Ready* | The parser is ready to run. | 402 * | *Complete* | There is nothing left to parse. | 403 * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. 
| 404 * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. | 405 * 406 * @since 6.5.0 407 * 408 * @see WP_HTML_Tag_Processor::STATE_READY 409 * @see WP_HTML_Tag_Processor::STATE_COMPLETE 410 * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE 411 * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG 412 * 413 * @var string 414 */ 415 private $parser_state = self::STATE_READY; 318 416 319 417 /** … … 545 643 * 546 644 * @since 6.2.0 645 * @since 6.5.0 No longer processes incomplete tokens at end of document; pauses the processor at start of token. 547 646 * 548 647 * @param array|string|null $query { … … 563 662 564 663 do { 565 if ( $this->bytes_already_parsed >= strlen( $this->html) ) {664 if ( false === $this->next_token() ) { 566 665 return false; 567 666 } 568 667 569 // Find the next tag if it exists. 570 if ( false === $this->parse_next_tag() ) { 571 $this->bytes_already_parsed = strlen( $this->html ); 572 573 return false; 574 } 575 576 // Parse all of its attributes. 577 while ( $this->parse_next_attribute() ) { 668 if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { 578 669 continue; 579 670 } 580 671 581 // Ensure that the tag closes before the end of the document.582 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {583 return false;584 }585 586 $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );587 if ( false === $tag_ends_at ) {588 return false;589 }590 $this->token_length = $tag_ends_at - $this->token_starts_at;591 $this->bytes_already_parsed = $tag_ends_at;592 593 // Finally, check if the parsed tag and its attributes match the search query.594 672 if ( $this->matches() ) { 595 673 ++$already_found; 596 674 } 597 598 /* 599 * For non-DATA sections which might contain text that looks like HTML tags but 600 * isn't, scan with the appropriate alternative mode. Looking at the first letter 601 * of the tag name as a pre-check avoids a string allocation when it's not needed. 
602 */ 603 $t = $this->html[ $this->tag_name_starts_at ]; 604 if ( 605 ! $this->is_closing_tag && 675 } while ( $already_found < $this->sought_match_offset ); 676 677 return true; 678 } 679 680 /** 681 * Finds the next token in the HTML document. 682 * 683 * An HTML document can be viewed as a stream of tokens, 684 * where tokens are things like HTML tags, HTML comments, 685 * text nodes, etc. This method finds the next token in 686 * the HTML document and returns whether it found one. 687 * 688 * If it starts parsing a token and reaches the end of the 689 * document then it will seek to the start of the last 690 * token and pause, returning `false` to indicate that it 691 * failed to find a complete token. 692 * 693 * Possible token types, based on the HTML specification: 694 * 695 * - an HTML tag, whether opening, closing, or void. 696 * - a text node - the plaintext inside tags. 697 * - an HTML comment. 698 * - a DOCTYPE declaration. 699 * - a processing instruction, e.g. `<?xml version="1.0" ?>`. 700 * 701 * The Tag Processor currently only supports the tag token. 702 * 703 * @since 6.5.0 704 * 705 * @return bool Whether a token was parsed. 706 */ 707 public function next_token() { 708 $this->get_updated_html(); 709 $was_at = $this->bytes_already_parsed; 710 711 // Don't proceed if there's nothing more to scan. 712 if ( 713 self::STATE_COMPLETE === $this->parser_state || 714 self::STATE_INCOMPLETE === $this->parser_state 715 ) { 716 return false; 717 } 718 719 /* 720 * The next step in the parsing loop determines the parsing state; 721 * clear it so that state doesn't linger from the previous step. 722 */ 723 $this->parser_state = self::STATE_READY; 724 725 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 726 $this->parser_state = self::STATE_COMPLETE; 727 return false; 728 } 729 730 // Find the next tag if it exists. 
731 if ( false === $this->parse_next_tag() ) { 732 if ( self::STATE_INCOMPLETE === $this->parser_state ) { 733 $this->bytes_already_parsed = $was_at; 734 } 735 736 return false; 737 } 738 739 // Parse all of its attributes. 740 while ( $this->parse_next_attribute() ) { 741 continue; 742 } 743 744 // Ensure that the tag closes before the end of the document. 745 if ( 746 self::STATE_INCOMPLETE === $this->parser_state || 747 $this->bytes_already_parsed >= strlen( $this->html ) 748 ) { 749 // Does this appropriately clear state (parsed attributes)? 750 $this->parser_state = self::STATE_INCOMPLETE; 751 $this->bytes_already_parsed = $was_at; 752 753 return false; 754 } 755 756 $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); 757 if ( false === $tag_ends_at ) { 758 $this->parser_state = self::STATE_INCOMPLETE; 759 $this->bytes_already_parsed = $was_at; 760 761 return false; 762 } 763 $this->parser_state = self::STATE_MATCHED_TAG; 764 $this->token_length = $tag_ends_at - $this->token_starts_at; 765 $this->bytes_already_parsed = $tag_ends_at; 766 767 /* 768 * For non-DATA sections which might contain text that looks like HTML tags but 769 * isn't, scan with the appropriate alternative mode. Looking at the first letter 770 * of the tag name as a pre-check avoids a string allocation when it's not needed. 771 */ 772 $t = $this->html[ $this->tag_name_starts_at ]; 773 if ( 774 ! $this->is_closing_tag && 775 ( 776 'i' === $t || 'I' === $t || 777 'n' === $t || 'N' === $t || 778 's' === $t || 'S' === $t || 779 't' === $t || 'T' === $t || 780 'x' === $t || 'X' === $t 781 ) 782 ) { 783 $tag_name = $this->get_tag(); 784 785 if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) { 786 $this->parser_state = self::STATE_INCOMPLETE; 787 $this->bytes_already_parsed = $was_at; 788 789 return false; 790 } elseif ( 791 ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && 792 ! 
$this->skip_rcdata( $tag_name ) 793 ) { 794 $this->parser_state = self::STATE_INCOMPLETE; 795 $this->bytes_already_parsed = $was_at; 796 797 return false; 798 } elseif ( 606 799 ( 607 'i' === $t || 'I' === $t || 608 'n' === $t || 'N' === $t || 609 's' === $t || 'S' === $t || 610 't' === $t || 'T' === $t 611 ) ) { 612 $tag_name = $this->get_tag(); 613 614 if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) { 615 $this->bytes_already_parsed = strlen( $this->html ); 616 return false; 617 } elseif ( 618 ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && 619 ! $this->skip_rcdata( $tag_name ) 620 ) { 621 $this->bytes_already_parsed = strlen( $this->html ); 622 return false; 623 } elseif ( 624 ( 625 'IFRAME' === $tag_name || 626 'NOEMBED' === $tag_name || 627 'NOFRAMES' === $tag_name || 628 'NOSCRIPT' === $tag_name || 629 'STYLE' === $tag_name 630 ) && 631 ! $this->skip_rawtext( $tag_name ) 632 ) { 633 /* 634 * "XMP" should be here too but its rules are more complicated and require the 635 * complexity of the HTML Processor (it needs to close out any open P element, 636 * meaning it can't be skipped here or else the HTML Processor will lose its 637 * place). For now, it can be ignored as it's a rare HTML tag in practice and 638 * any normative HTML should be using PRE instead. 639 */ 640 $this->bytes_already_parsed = strlen( $this->html ); 641 return false; 642 } 643 } 644 } while ( $already_found < $this->sought_match_offset ); 800 'IFRAME' === $tag_name || 801 'NOEMBED' === $tag_name || 802 'NOFRAMES' === $tag_name || 803 'STYLE' === $tag_name || 804 'XMP' === $tag_name 805 ) && 806 ! $this->skip_rawtext( $tag_name ) 807 ) { 808 $this->parser_state = self::STATE_INCOMPLETE; 809 $this->bytes_already_parsed = $was_at; 810 811 return false; 812 } 813 } 645 814 646 815 return true; 647 816 } 648 817 818 /** 819 * Whether the processor paused because the input HTML document ended 820 * in the middle of a syntax element, such as in the middle of a tag. 
821 * 822 * Example: 823 * 824 * $processor = new WP_HTML_Tag_Processor( '<input type="text" value="Th' ); 825 * false === $processor->next_tag(); 826 * true === $processor->paused_at_incomplete_token(); 827 * 828 * @since 6.5.0 829 * 830 * @return bool Whether the parse paused at the start of an incomplete token. 831 */ 832 public function paused_at_incomplete_token() { 833 return self::STATE_INCOMPLETE === $this->parser_state; 834 } 649 835 650 836 /** … … 665 851 */ 666 852 public function class_list() { 853 if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { 854 return; 855 } 856 667 857 /** @var string $class contains the string value of the class attribute, with character references decoded. */ 668 858 $class = $this->get_attribute( 'class' ); … … 720 910 */ 721 911 public function has_class( $wanted_class ) { 722 if ( ! $this->tag_name_starts_at) {912 if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { 723 913 return null; 724 914 } … … 817 1007 */ 818 1008 public function set_bookmark( $name ) { 819 if ( null === $this->tag_name_starts_at ) { 1009 // It only makes sense to set a bookmark if the parser has paused on a concrete token. 1010 if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { 820 1011 return false; 821 1012 } … … 896 1087 // Fail if there is no possible tag closer. 
897 1088 if ( false === $at || ( $at + $tag_length ) >= $doc_length ) { 898 $this->bytes_already_parsed = $doc_length;899 1089 return false; 900 1090 } … … 923 1113 $at += $tag_length; 924 1114 $this->bytes_already_parsed = $at; 1115 1116 if ( $at >= strlen( $html ) ) { 1117 return false; 1118 } 925 1119 926 1120 /* … … 1074 1268 } 1075 1269 1270 if ( $this->bytes_already_parsed >= $doc_length ) { 1271 $this->parser_state = self::STATE_INCOMPLETE; 1272 1273 return false; 1274 } 1275 1076 1276 if ( '>' === $html[ $this->bytes_already_parsed ] ) { 1077 1277 $this->bytes_already_parsed = $closer_potentially_starts_at; … … 1108 1308 while ( false !== $at && $at < $doc_length ) { 1109 1309 $at = strpos( $html, '<', $at ); 1310 1311 /* 1312 * This does not imply an incomplete parse; it indicates that there 1313 * can be nothing left in the document other than a #text node. 1314 */ 1110 1315 if ( false === $at ) { 1111 1316 return false; … … 1114 1319 $this->token_starts_at = $at; 1115 1320 1116 if ( '/' === $this->html[ $at + 1 ] ) {1321 if ( $at + 1 < $doc_length && '/' === $this->html[ $at + 1 ] ) { 1117 1322 $this->is_closing_tag = true; 1118 1323 ++$at; … … 1148 1353 * the document. There is nothing left to parse. 1149 1354 */ 1150 if ( $at + 1 >= strlen( $html ) ) { 1355 if ( $at + 1 >= $doc_length ) { 1356 $this->parser_state = self::STATE_INCOMPLETE; 1357 1151 1358 return false; 1152 1359 } … … 1162 1369 */ 1163 1370 if ( 1164 strlen( $html )> $at + 3 &&1371 $doc_length > $at + 3 && 1165 1372 '-' === $html[ $at + 2 ] && 1166 1373 '-' === $html[ $at + 3 ] … … 1168 1375 $closer_at = $at + 4; 1169 1376 // If it's not possible to close the comment then there is nothing more to scan. 
1170 if ( strlen( $html ) <= $closer_at ) { 1377 if ( $doc_length <= $closer_at ) { 1378 $this->parser_state = self::STATE_INCOMPLETE; 1379 1171 1380 return false; 1172 1381 } … … 1186 1395 */ 1187 1396 --$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping. 1188 while ( ++$closer_at < strlen( $html )) {1397 while ( ++$closer_at < $doc_length ) { 1189 1398 $closer_at = strpos( $html, '--', $closer_at ); 1190 1399 if ( false === $closer_at ) { 1400 $this->parser_state = self::STATE_INCOMPLETE; 1401 1191 1402 return false; 1192 1403 } 1193 1404 1194 if ( $closer_at + 2 < strlen( $html )&& '>' === $html[ $closer_at + 2 ] ) {1405 if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) { 1195 1406 $at = $closer_at + 3; 1196 1407 continue 2; 1197 1408 } 1198 1409 1199 if ( $closer_at + 3 < strlen( $html )&& '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) {1410 if ( $closer_at + 3 < $doc_length && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) { 1200 1411 $at = $closer_at + 4; 1201 1412 continue 2; … … 1210 1421 */ 1211 1422 if ( 1212 strlen( $html )> $at + 8 &&1423 $doc_length > $at + 8 && 1213 1424 '[' === $html[ $at + 2 ] && 1214 1425 'C' === $html[ $at + 3 ] && … … 1221 1432 $closer_at = strpos( $html, ']]>', $at + 9 ); 1222 1433 if ( false === $closer_at ) { 1434 $this->parser_state = self::STATE_INCOMPLETE; 1435 1223 1436 return false; 1224 1437 } … … 1234 1447 */ 1235 1448 if ( 1236 strlen( $html )> $at + 8 &&1449 $doc_length > $at + 8 && 1237 1450 ( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) && 1238 1451 ( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) && … … 1245 1458 $closer_at = strpos( $html, '>', $at + 9 ); 1246 1459 if ( false === $closer_at ) { 1460 $this->parser_state = self::STATE_INCOMPLETE; 1461 1247 1462 return false; 1248 1463 } … … 1254 1469 /* 1255 1470 * Anything else here is an incorrectly-opened comment and transitions 1256 * 
to the bogus comment state - skip to the nearest >. 1471 * to the bogus comment state - skip to the nearest >. If no closer is 1472 * found then the HTML was truncated inside the markup declaration. 1257 1473 */ 1258 1474 $at = strpos( $html, '>', $at + 1 ); 1475 if ( false === $at ) { 1476 $this->parser_state = self::STATE_INCOMPLETE; 1477 1478 return false; 1479 } 1480 1259 1481 continue; 1260 1482 } … … 1262 1484 /* 1263 1485 * </> is a missing end tag name, which is ignored. 1486 * 1487 * This was also known as the "presumptuous empty tag" 1488 * in early discussions as it was proposed to close 1489 * the nearest previous opening tag. 1264 1490 * 1265 1491 * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name … … 1277 1503 $closer_at = strpos( $html, '>', $at + 2 ); 1278 1504 if ( false === $closer_at ) { 1505 $this->parser_state = self::STATE_INCOMPLETE; 1506 1279 1507 return false; 1280 1508 } … … 1291 1519 */ 1292 1520 if ( $this->is_closing_tag ) { 1521 // No chance of finding a closer. 
1522 if ( $at + 3 > $doc_length ) { 1523 return false; 1524 } 1525 1293 1526 $closer_at = strpos( $html, '>', $at + 3 ); 1294 1527 if ( false === $closer_at ) { 1528 $this->parser_state = self::STATE_INCOMPLETE; 1529 1295 1530 return false; 1296 1531 } … … 1317 1552 $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed ); 1318 1553 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1554 $this->parser_state = self::STATE_INCOMPLETE; 1555 1319 1556 return false; 1320 1557 } … … 1339 1576 $this->bytes_already_parsed += $name_length; 1340 1577 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1578 $this->parser_state = self::STATE_INCOMPLETE; 1579 1341 1580 return false; 1342 1581 } … … 1344 1583 $this->skip_whitespace(); 1345 1584 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1585 $this->parser_state = self::STATE_INCOMPLETE; 1586 1346 1587 return false; 1347 1588 } … … 1352 1593 $this->skip_whitespace(); 1353 1594 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1595 $this->parser_state = self::STATE_INCOMPLETE; 1596 1354 1597 return false; 1355 1598 } … … 1378 1621 1379 1622 if ( $attribute_end >= strlen( $this->html ) ) { 1623 $this->parser_state = self::STATE_INCOMPLETE; 1624 1380 1625 return false; 1381 1626 } … … 1444 1689 */ 1445 1690 private function after_tag() { 1446 $this->get_updated_html();1447 1691 $this->token_starts_at = null; 1448 1692 $this->token_length = null; … … 1787 2031 */ 1788 2032 private function get_enqueued_attribute_value( $comparable_name ) { 2033 if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { 2034 return false; 2035 } 2036 1789 2037 if ( ! 
isset( $this->lexical_updates[ $comparable_name ] ) ) { 1790 2038 return false; … … 1854 2102 */ 1855 2103 public function get_attribute( $name ) { 1856 if ( null === $this->tag_name_starts_at) {2104 if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { 1857 2105 return null; 1858 2106 } … … 1934 2182 */ 1935 2183 public function get_attribute_names_with_prefix( $prefix ) { 1936 if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) { 2184 if ( 2185 self::STATE_MATCHED_TAG !== $this->parser_state || 2186 $this->is_closing_tag 2187 ) { 1937 2188 return null; 1938 2189 } … … 1966 2217 */ 1967 2218 public function get_tag() { 1968 if ( null === $this->tag_name_starts_at) {2219 if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { 1969 2220 return null; 1970 2221 } … … 1993 2244 */ 1994 2245 public function has_self_closing_flag() { 1995 if ( ! $this->tag_name_starts_at) {2246 if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { 1996 2247 return false; 1997 2248 } … … 2025 2276 */ 2026 2277 public function is_tag_closer() { 2027 return $this->is_closing_tag; 2278 return ( 2279 self::STATE_MATCHED_TAG === $this->parser_state && 2280 $this->is_closing_tag 2281 ); 2028 2282 } 2029 2283 … … 2045 2299 */ 2046 2300 public function set_attribute( $name, $value ) { 2047 if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) { 2301 if ( 2302 self::STATE_MATCHED_TAG !== $this->parser_state || 2303 $this->is_closing_tag 2304 ) { 2048 2305 return false; 2049 2306 } … … 2178 2435 */ 2179 2436 public function remove_attribute( $name ) { 2180 if ( $this->is_closing_tag ) { 2437 if ( 2438 self::STATE_MATCHED_TAG !== $this->parser_state || 2439 $this->is_closing_tag 2440 ) { 2181 2441 return false; 2182 2442 } … … 2255 2515 */ 2256 2516 public function add_class( $class_name ) { 2257 if ( $this->is_closing_tag ) { 2517 if ( 2518 self::STATE_MATCHED_TAG !== $this->parser_state || 2519 $this->is_closing_tag 2520 ) { 2258 2521 return false; 2259 2522 } 
2260 2523 2261 if ( null !== $this->tag_name_starts_at ) { 2262 $this->classname_updates[ $class_name ] = self::ADD_CLASS; 2263 } 2524 $this->classname_updates[ $class_name ] = self::ADD_CLASS; 2264 2525 2265 2526 return true; … … 2275 2536 */ 2276 2537 public function remove_class( $class_name ) { 2277 if ( $this->is_closing_tag ) { 2538 if ( 2539 self::STATE_MATCHED_TAG !== $this->parser_state || 2540 $this->is_closing_tag 2541 ) { 2278 2542 return false; 2279 2543 } … … 2481 2745 return true; 2482 2746 } 2747 2748 /** 2749 * Parser Ready State 2750 * 2751 * Indicates that the parser is ready to run and waiting for a state transition. 2752 * It may not have started yet, or it may have just finished parsing a token and 2753 * is ready to find the next one. 2754 * 2755 * @since 6.5.0 2756 * 2757 * @access private 2758 */ 2759 const STATE_READY = 'STATE_READY'; 2760 2761 /** 2762 * Parser Complete State 2763 * 2764 * Indicates that the parser has reached the end of the document and there is 2765 * nothing left to scan. It finished parsing the last token completely. 2766 * 2767 * @since 6.5.0 2768 * 2769 * @access private 2770 */ 2771 const STATE_COMPLETE = 'STATE_COMPLETE'; 2772 2773 /** 2774 * Parser Incomplete State 2775 * 2776 * Indicates that the parser has reached the end of the document before finishing 2777 * a token. It started parsing a token but there is a possibility that the input 2778 * HTML document was truncated in the middle of a token. 2779 * 2780 * The parser is reset at the start of the incomplete token and has paused. There 2781 * is nothing more that can be scanned unless provided a more complete document. 2782 * 2783 * @since 6.5.0 2784 * 2785 * @access private 2786 */ 2787 const STATE_INCOMPLETE = 'STATE_INCOMPLETE'; 2788 2789 /** 2790 * Parser Matched Tag State 2791 * 2792 * Indicates that the parser has found an HTML tag and it's possible to get 2793 * the tag name and read or modify its attributes (if it's not a closing tag). 
2794 * 2795 * @since 6.5.0 2796 * 2797 * @access private 2798 */ 2799 const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG'; 2483 2800 } -
trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php
r56703 r57211 1757 1757 * 1758 1758 * @covers WP_HTML_Tag_Processor::next_tag 1759 * @covers WP_HTML_Tag_Processor::paused_at_incomplete_token 1759 1760 */ 1760 1761 public function test_unclosed_script_tag_should_not_cause_an_infinite_loop() { 1761 $p = new WP_HTML_Tag_Processor( '<script>' ); 1762 $p->next_tag(); 1763 $this->assertSame( 'SCRIPT', $p->get_tag(), 'Did not find script tag' ); 1762 $p = new WP_HTML_Tag_Processor( '<script><div>' ); 1763 $this->assertFalse( 1764 $p->next_tag(), 1765 'Should not have stopped on an opening SCRIPT tag without a proper closing tag in the document.' 1766 ); 1767 $this->assertTrue( 1768 $p->paused_at_incomplete_token(), 1769 "Should have paused the parser because of the incomplete SCRIPT tag but didn't." 1770 ); 1771 1772 // Run this to ensure that the test ends (not in an infinite loop). 1764 1773 $p->next_tag(); 1765 1774 } … … 1933 1942 1934 1943 /** 1944 * Ensures matching elements inside NOSCRIPT elements. 1945 * 1946 * In a browser when the scripting flag is enabled, everything inside 1947 * the NOSCRIPT element will be ignored and treated as RAW TEXT. This 1948 * means that it's valid to send what looks like incomplete or partial 1949 * HTML syntax without impacting a rendered page. The Tag Processor is 1950 * a parser with the scripting flag disabled, however, and needs to 1951 * expose all the potential content that some code might want to modify. 1952 * 1953 * Were it not for this then the NOSCRIPT tag would be handled like the 1954 * other tags in the RAW TEXT special group, e.g. NOEMBED or STYLE. 1955 * 1956 * @ticket 60122 1957 * 1958 * @covers WP_HTML_Tag_Processor::next_tag 1959 */ 1960 public function test_processes_inside_of_noscript_elements() { 1961 $p = new WP_HTML_Tag_Processor( '<noscript><input type="submit"></noscript><div>' ); 1962 1963 $this->assertTrue( $p->next_tag( 'INPUT' ), 'Failed to find INPUT element inside NOSCRIPT element.' 
); 1964 $this->assertTrue( $p->next_tag( 'DIV' ), 'Failed to find DIV element after NOSCRIPT element.' ); 1965 } 1966 1967 /** 1935 1968 * @ticket 59292 1936 1969 * … … 1963 1996 'NOEMBED' => array( '<noembed><p></p></noembed><div target>' ), 1964 1997 'NOFRAMES' => array( '<noframes><p>Check the rules here.</p></noframes><div target>' ), 1965 'NOSCRIPT' => array( '<noscript><span>This assumes that scripting mode is enabled.</span></noscript><p target>' ),1966 1998 'STYLE' => array( '<style>* { margin: 0 }</style><div target>' ), 1967 1999 'STYLE hiding DIV' => array( '<style>li::before { content: "<div non-target>" }</style><div target>' ), … … 2140 2172 * 2141 2173 * @covers WP_HTML_Tag_Processor::next_tag 2174 * @covers WP_HTML_Tag_Processor::paused_at_incomplete_token 2142 2175 * 2143 2176 * @dataProvider data_html_with_unclosed_comments 2144 2177 * 2145 * @param string $html_ending_before_comment_close HTML with opened comments that aren't closed 2178 * @param string $html_ending_before_comment_close HTML with opened comments that aren't closed. 2146 2179 */ 2147 2180 public function test_documents_may_end_with_unclosed_comment( $html_ending_before_comment_close ) { 2148 2181 $p = new WP_HTML_Tag_Processor( $html_ending_before_comment_close ); 2149 2182 2150 $this->assertFalse( $p->next_tag() ); 2183 $this->assertFalse( 2184 $p->next_tag(), 2185 "Should not have found any tag, but found {$p->get_tag()}." 2186 ); 2187 2188 $this->assertTrue( 2189 $p->paused_at_incomplete_token(), 2190 "Should have indicated that the parser found an incomplete token but didn't." 2191 ); 2151 2192 } 2152 2193 … … 2281 2322 2282 2323 /** 2324 * Ensures that no tags are matched in a document containing only non-tag content. 
2325 * 2326 * @ticket 60122 2327 * 2328 * @covers WP_HTML_Tag_Processor::next_tag 2329 * @covers WP_HTML_Tag_Processor::paused_at_incomplete_token 2330 * 2331 * @dataProvider data_html_without_tags 2332 * 2333 * @param string $html_without_tags HTML without any tags in it. 2334 */ 2335 public function test_next_tag_returns_false_when_there_are_no_tags( $html_without_tags ) { 2336 $processor = new WP_HTML_Tag_Processor( $html_without_tags ); 2337 2338 $this->assertFalse( 2339 $processor->next_tag(), 2340 "Shouldn't have found any tags but found {$processor->get_tag()}." 2341 ); 2342 2343 $this->assertFalse( 2344 $processor->paused_at_incomplete_token(), 2345 'Should have indicated that end of document was reached without evidence that elements were truncated.' 2346 ); 2347 } 2348 2349 /** 2350 * Data provider. 2351 * 2352 * @return array[] 2353 */ 2354 public function data_html_without_tags() { 2355 return array( 2356 'DOCTYPE declaration' => array( '<!DOCTYPE html>Just some HTML' ), 2357 'No tags' => array( 'this is nothing more than a text node' ), 2358 'Text with comments' => array( 'One <!-- sneaky --> comment.' ), 2359 'Empty tag closer' => array( '</>' ), 2360 'Processing instruction' => array( '<?xml version="1.0"?>' ), 2361 'Combination XML-like' => array( '<!DOCTYPE xml><?xml version=""?><!-- this is not a real document. --><![CDATA[it only serves as a test]]>' ), 2362 ); 2363 } 2364 2365 /** 2366 * Ensures that the processor doesn't attempt to match an incomplete token. 2367 * 2283 2368 * @ticket 58637 2284 2369 * 2285 2370 * @covers WP_HTML_Tag_Processor::next_tag 2371 * @covers WP_HTML_Tag_Processor::paused_at_incomplete_token 2286 2372 * 2287 2373 * @dataProvider data_incomplete_syntax_elements … … 2289 2375 * @param string $incomplete_html HTML text containing some kind of incomplete syntax. 
2290 2376 */ 2291 public function test_ returns_false_for_incomplete_syntax_elements( $incomplete_html ) {2377 public function test_next_tag_returns_false_for_incomplete_syntax_elements( $incomplete_html ) { 2292 2378 $p = new WP_HTML_Tag_Processor( $incomplete_html ); 2293 $this->assertFalse( $p->next_tag() ); 2379 2380 $this->assertFalse( 2381 $p->next_tag(), 2382 "Shouldn't have found any tags but found {$p->get_tag()}." 2383 ); 2384 2385 $this->assertTrue( 2386 $p->paused_at_incomplete_token(), 2387 "Should have indicated that the parser found an incomplete token but didn't." 2388 ); 2294 2389 } 2295 2390 … … 2301 2396 public function data_incomplete_syntax_elements() { 2302 2397 return array( 2303 'No tags' => array( 'this is nothing more than a text node' ),2304 2398 'Incomplete tag name' => array( '<swit' ), 2305 2399 'Incomplete tag (no attributes)' => array( '<div' ), … … 2314 2408 'Incomplete DOCTYPE' => array( '<!DOCTYPE html' ), 2315 2409 'Partial DOCTYPE' => array( '<!DOCTY' ), 2316 'Incomplete CDATA' => array( '<[CDATA[something inside of here needs to get out' ), 2317 'Partial CDATA' => array( '<[CDA' ), 2318 'Partially closed CDATA]' => array( '<[CDATA[cannot escape]' ), 2319 'Partially closed CDATA]>' => array( '<[CDATA[cannot escape]>' ), 2410 'Incomplete CDATA' => array( '<![CDATA[something inside of here needs to get out' ), 2411 'Partial CDATA' => array( '<![CDA' ), 2412 'Partially closed CDATA]' => array( '<![CDATA[cannot escape]' ), 2413 'Partially closed CDATA]>' => array( '<![CDATA[cannot escape]>' ), 2414 'Unclosed IFRAME' => array( '<iframe><div>' ), 2415 'Unclosed NOEMBED' => array( '<noembed><div>' ), 2416 'Unclosed NOFRAMES' => array( '<noframes><div>' ), 2417 'Unclosed SCRIPT' => array( '<script><div>' ), 2418 'Unclosed STYLE' => array( '<style><div>' ), 2419 'Unclosed TEXTAREA' => array( '<textarea><div>' ), 2420 'Unclosed TITLE' => array( '<title><div>' ), 2421 'Unclosed XMP' => array( '<xmp><div>' ), 2422 'Partially closed IFRAME' 
=> array( '<iframe><div></iframe' ), 2423 'Partially closed NOEMBED' => array( '<noembed><div></noembed' ), 2424 'Partially closed NOFRAMES' => array( '<noframes><div></noframes' ), 2425 'Partially closed SCRIPT' => array( '<script><div></script' ), 2426 'Partially closed STYLE' => array( '<style><div></style' ), 2427 'Partially closed TEXTAREA' => array( '<textarea><div></textarea' ), 2428 'Partially closed TITLE' => array( '<title><div></title' ), 2429 'Partially closed XMP' => array( '<xmp><div></xmp' ), 2320 2430 ); 2321 2431 } … … 2416 2526 public function test_updating_attributes_in_malformed_html( $html, $expected ) { 2417 2527 $p = new WP_HTML_Tag_Processor( $html ); 2418 $ p->next_tag();2528 $this->assertTrue( $p->next_tag(), 'Could not find first tag.' ); 2419 2529 $p->set_attribute( 'foo', 'bar' ); 2420 2530 $p->add_class( 'firstTag' ); … … 2435 2545 */ 2436 2546 public function data_updating_attributes_in_malformed_html() { 2437 $null_byte = chr( 0 );2438 2439 2547 return array( 2440 2548 'Invalid entity inside attribute value' => array( … … 2495 2603 ), 2496 2604 'id without double quotation marks around null byte' => array( 2497 'input' => '<hr id' . $null_byte . 'zero="test"><span>test</span>',2498 'expected' => '<hr class="firstTag" foo="bar" id' . $null_byte . 'zero="test"><span class="secondTag">test</span>',2605 'input' => "<hr id\x00zero=\"test\"><span>test</span>", 2606 'expected' => "<hr class=\"firstTag\" foo=\"bar\" id\x00zero=\"test\"><span class=\"secondTag\">test</span>", 2499 2607 ), 2500 2608 'Unexpected > before an attribute' => array( … … 2584 2692 ); 2585 2693 } 2694 2695 /** 2696 * @covers WP_HTML_Tag_Processor::next_tag 2697 */ 2698 public function test_handles_malformed_taglike_open_short_html() { 2699 $p = new WP_HTML_Tag_Processor( '<' ); 2700 $result = $p->next_tag(); 2701 $this->assertFalse( $result, 'Did not handle "<" html properly.' 
); 2702 } 2703 2704 /** 2705 * @covers WP_HTML_Tag_Processor::next_tag 2706 */ 2707 public function test_handles_malformed_taglike_close_short_html() { 2708 $p = new WP_HTML_Tag_Processor( '</ ' ); 2709 $result = $p->next_tag(); 2710 $this->assertFalse( $result, 'Did not handle "</ " html properly.' ); 2711 } 2586 2712 }
Note: See TracChangeset
for help on using the changeset viewer.