Changeset 57348
- Timestamp:
- 01/24/2024 11:35:46 PM (9 months ago)
- Location:
- trunk
- Files:
-
- 1 added
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/html-api/class-wp-html-processor.php
r57343 r57348 151 151 152 152 /** 153 * Static query for instructing the Tag Processor to visit every token.154 *155 * @access private156 *157 * @since 6.4.0158 *159 * @var array160 */161 const VISIT_EVERYTHING = array( 'tag_closers' => 'visit' );162 163 /**164 153 * Holds the working state of the parser, including the stack of 165 154 * open elements and the stack of active formatting elements. … … 423 412 424 413 return false; 414 } 415 416 /** 417 * Ensures internal accounting is maintained for HTML semantic rules while 418 * the underlying Tag Processor class is seeking to a bookmark. 419 * 420 * This doesn't currently have a way to represent non-tags and doesn't process 421 * semantic rules for text nodes. For access to the raw tokens consider using 422 * WP_HTML_Tag_Processor instead. 423 * 424 * @since 6.5.0 Added for internal support; do not use. 425 * 426 * @access private 427 * 428 * @return bool 429 */ 430 public function next_token() { 431 $found_a_token = parent::next_token(); 432 433 if ( '#tag' === $this->get_token_type() ) { 434 $this->step( self::REPROCESS_CURRENT_NODE ); 435 } 436 437 return $found_a_token; 425 438 } 426 439 … … 521 534 } 522 535 523 parent::next_tag( self::VISIT_EVERYTHING ); 536 while ( parent::next_token() && '#tag' !== $this->get_token_type() ) { 537 continue; 538 } 524 539 } 525 540 -
trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php
r57227 r57348 248 248 * } 249 249 * 250 * ## Tokens and finer-grained processing. 251 * 252 * It's possible to scan through every lexical token in the 253 * HTML document using the `next_token()` function. This 254 * alternative form takes no argument and provides no built-in 255 * query syntax. 256 * 257 * Example: 258 * 259 * $title = '(untitled)'; 260 * $text = ''; 261 * while ( $processor->next_token() ) { 262 * switch ( $processor->get_token_name() ) { 263 * case '#text': 264 * $text .= $processor->get_modifiable_text(); 265 * break; 266 * 267 * case 'BR': 268 * $text .= "\n"; 269 * break; 270 * 271 * case 'TITLE': 272 * $title = $processor->get_modifiable_text(); 273 * break; 274 * } 275 * } 276 * return trim( "# {$title}\n\n{$text}" ); 277 * 278 * ### Tokens and _modifiable text_. 279 * 280 * #### Special "atomic" HTML elements. 281 * 282 * Not all HTML elements are able to contain other elements inside of them. 283 * For instance, the contents inside a TITLE element are plaintext (except 284 * that character references like & will be decoded). This means that 285 * if the string `<img>` appears inside a TITLE element, then it's not an 286 * image tag, but rather it's text describing an image tag. Likewise, the 287 * contents of a SCRIPT or STYLE element are handled entirely separately in 288 * a browser than the contents of other elements because they represent a 289 * different language than HTML. 290 * 291 * For these elements the Tag Processor treats the entire sequence as one, 292 * from the opening tag, including its contents, through its closing tag. 293 * This means that the it's not possible to match the closing tag for a 294 * SCRIPT element unless it's unexpected; the Tag Processor already matched 295 * it when it found the opening tag. 296 * 297 * The inner contents of these elements are that element's _modifiable text_. 
298 * 299 * The special elements are: 300 * - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy 301 * style of including Javascript inside of HTML comments to avoid accidentally 302 * closing the SCRIPT from inside a Javascript string. E.g. `console.log( '</script>' )`. 303 * - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any 304 * character references are decoded. E.g. `1 < 2 < 3` becomes `1 < 2 < 3`. 305 * - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAME`, `STYLE` whose contents are treated as 306 * raw plaintext and left as-is. E.g. `1 < 2 < 3` remains `1 < 2 < 3`. 307 * 308 * #### Other tokens with modifiable text. 309 * 310 * There are also non-elements which are void/self-closing in nature and contain 311 * modifiable text that is part of that individual syntax token itself. 312 * 313 * - `#text` nodes, whose entire token _is_ the modifiable text. 314 * - HTML comments and tokens that become comments due to some syntax error. The 315 * text for these tokens is the portion of the comment inside of the syntax. 316 * E.g. for `<!-- comment -->` the text is `" comment "` (note the spaces are included). 317 * - `CDATA` sections, whose text is the content inside of the section itself. E.g. for 318 * `<![CDATA[some content]]>` the text is `"some content"` (with restrictions [1]). 319 * - "Funky comments," which are a special case of invalid closing tags whose name is 320 * invalid. The text for these nodes is the text that a browser would transform into 321 * an HTML comment when parsing. E.g. for `</%post_author>` the text is `%post_author`. 322 * - `DOCTYPE` declarations like `<DOCTYPE html>` which have no closing tag. 323 * - XML Processing instruction nodes like `<?wp __( "Like" ); ?>` (with restrictions [2]). 324 * - The empty end tag `</>` which is ignored in the browser and DOM. 325 * 326 * [1]: There are no CDATA sections in HTML. 
When encountering `<![CDATA[`, everything 327 * until the next `>` becomes a bogus HTML comment, meaning there can be no CDATA 328 * section in an HTML document containing `>`. The Tag Processor will first find 329 * all valid and bogus HTML comments, and then if the comment _would_ have been a 330 * CDATA section _were they to exist_, it will indicate this as the type of comment. 331 * 332 * [2]: XML allows a broader range of characters in a processing instruction's target name 333 * and disallows "xml" as a name, since it's special. The Tag Processor only recognizes 334 * target names with an ASCII-representable subset of characters. It also exhibits the 335 * same constraint as with CDATA sections, in that `>` cannot exist within the token 336 * since Processing Instructions do no exist within HTML and their syntax transforms 337 * into a bogus comment in the DOM. 338 * 250 339 * ## Design and limitations 251 340 * … … 321 410 * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE. 322 411 * @since 6.5.0 Pauses processor when input ends in an incomplete syntax token. 323 * Introduces "special" elements which act like void elements, e.g. STYLE. 412 * Introduces "special" elements which act like void elements, e.g. TITLE, STYLE. 413 * Allows scanning through all tokens and processing modifiable text, where applicable. 324 414 */ 325 415 class WP_HTML_Tag_Processor { … … 397 487 * Specifies mode of operation of the parser at any given time. 398 488 * 399 * | State | Meaning | 400 * | --------------|----------------------------------------------------------------------| 401 * | *Ready* | The parser is ready to run. | 402 * | *Complete* | There is nothing left to parse. | 403 * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. | 404 * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. 
| 489 * | State | Meaning | 490 * | ----------------|----------------------------------------------------------------------| 491 * | *Ready* | The parser is ready to run. | 492 * | *Complete* | There is nothing left to parse. | 493 * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. | 494 * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. | 495 * | *Text node* | Found a #text node; this is plaintext and modifiable. | 496 * | *CDATA node* | Found a CDATA section; this is modifiable. | 497 * | *Comment* | Found a comment or bogus comment; this is modifiable. | 498 * | *Presumptuous* | Found an empty tag closer: `</>`. | 499 * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. | 405 500 * 406 501 * @since 6.5.0 … … 408 503 * @see WP_HTML_Tag_Processor::STATE_READY 409 504 * @see WP_HTML_Tag_Processor::STATE_COMPLETE 410 * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE 505 * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT 411 506 * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG 507 * @see WP_HTML_Tag_Processor::STATE_TEXT_NODE 508 * @see WP_HTML_Tag_Processor::STATE_CDATA_NODE 509 * @see WP_HTML_Tag_Processor::STATE_COMMENT 510 * @see WP_HTML_Tag_Processor::STATE_DOCTYPE 511 * @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG 512 * @see WP_HTML_Tag_Processor::STATE_FUNKY_COMMENT 412 513 * 413 514 * @var string 414 515 */ 415 private $parser_state = self::STATE_READY; 516 protected $parser_state = self::STATE_READY; 517 518 /** 519 * What kind of syntax token became an HTML comment. 520 * 521 * Since there are many ways in which HTML syntax can create an HTML comment, 522 * this indicates which of those caused it. This allows the Tag Processor to 523 * represent more from the original input document than would appear in the DOM. 
524 * 525 * @since 6.5.0 526 * 527 * @var string|null 528 */ 529 protected $comment_type = null; 416 530 417 531 /** … … 490 604 */ 491 605 private $tag_name_length; 606 607 /** 608 * Byte offset into input document where current modifiable text starts. 609 * 610 * @since 6.5.0 611 * 612 * @var int 613 */ 614 private $text_starts_at; 615 616 /** 617 * Byte length of modifiable text. 618 * 619 * @since 6.5.0 620 * 621 * @var string 622 */ 623 private $text_length; 492 624 493 625 /** … … 706 838 */ 707 839 public function next_token() { 840 $was_at = $this->bytes_already_parsed; 708 841 $this->get_updated_html(); 709 $was_at = $this->bytes_already_parsed;710 842 711 843 // Don't proceed if there's nothing more to scan. 712 844 if ( 713 845 self::STATE_COMPLETE === $this->parser_state || 714 self::STATE_INCOMPLETE === $this->parser_state846 self::STATE_INCOMPLETE_INPUT === $this->parser_state 715 847 ) { 716 848 return false; … … 730 862 // Find the next tag if it exists. 731 863 if ( false === $this->parse_next_tag() ) { 732 if ( self::STATE_INCOMPLETE === $this->parser_state ) {864 if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { 733 865 $this->bytes_already_parsed = $was_at; 734 866 } 735 867 736 868 return false; 869 } 870 871 /* 872 * For legacy reasons the rest of this function handles tags and their 873 * attributes. If the processor has reached the end of the document 874 * or if it matched any other token then it should return here to avoid 875 * attempting to process tag-specific syntax. 876 */ 877 if ( 878 self::STATE_INCOMPLETE_INPUT !== $this->parser_state && 879 self::STATE_COMPLETE !== $this->parser_state && 880 self::STATE_MATCHED_TAG !== $this->parser_state 881 ) { 882 return true; 737 883 } 738 884 … … 744 890 // Ensure that the tag closes before the end of the document. 
745 891 if ( 746 self::STATE_INCOMPLETE === $this->parser_state ||892 self::STATE_INCOMPLETE_INPUT === $this->parser_state || 747 893 $this->bytes_already_parsed >= strlen( $this->html ) 748 894 ) { 749 895 // Does this appropriately clear state (parsed attributes)? 750 $this->parser_state = self::STATE_INCOMPLETE ;896 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 751 897 $this->bytes_already_parsed = $was_at; 752 898 … … 756 902 $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); 757 903 if ( false === $tag_ends_at ) { 758 $this->parser_state = self::STATE_INCOMPLETE ;904 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 759 905 $this->bytes_already_parsed = $was_at; 760 906 … … 763 909 $this->parser_state = self::STATE_MATCHED_TAG; 764 910 $this->token_length = $tag_ends_at - $this->token_starts_at; 765 $this->bytes_already_parsed = $tag_ends_at ;911 $this->bytes_already_parsed = $tag_ends_at + 1; 766 912 767 913 /* … … 772 918 $t = $this->html[ $this->tag_name_starts_at ]; 773 919 if ( 774 ! $this->is_closing_tag &&775 (920 $this->is_closing_tag || 921 ! ( 776 922 'i' === $t || 'I' === $t || 777 923 'n' === $t || 'N' === $t || … … 781 927 ) 782 928 ) { 783 $tag_name = $this->get_tag(); 784 785 if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) { 786 $this->parser_state = self::STATE_INCOMPLETE; 787 $this->bytes_already_parsed = $was_at; 788 789 return false; 790 } elseif ( 791 ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && 792 ! $this->skip_rcdata( $tag_name ) 793 ) { 794 $this->parser_state = self::STATE_INCOMPLETE; 795 $this->bytes_already_parsed = $was_at; 796 797 return false; 798 } elseif ( 799 ( 800 'IFRAME' === $tag_name || 801 'NOEMBED' === $tag_name || 802 'NOFRAMES' === $tag_name || 803 'STYLE' === $tag_name || 804 'XMP' === $tag_name 805 ) && 806 ! 
$this->skip_rawtext( $tag_name ) 807 ) { 808 $this->parser_state = self::STATE_INCOMPLETE; 809 $this->bytes_already_parsed = $was_at; 810 811 return false; 812 } 813 } 929 return true; 930 } 931 932 $tag_name = $this->get_tag(); 933 934 /* 935 * Preserve the opening tag pointers, as these will be overwritten 936 * when finding the closing tag. They will be reset after finding 937 * the closing to tag to point to the opening of the special atomic 938 * tag sequence. 939 */ 940 $tag_name_starts_at = $this->tag_name_starts_at; 941 $tag_name_length = $this->tag_name_length; 942 $tag_ends_at = $this->token_starts_at + $this->token_length; 943 $attributes = $this->attributes; 944 $duplicate_attributes = $this->duplicate_attributes; 945 946 // Find the closing tag if necessary. 947 $found_closer = false; 948 switch ( $tag_name ) { 949 case 'SCRIPT': 950 $found_closer = $this->skip_script_data(); 951 break; 952 953 case 'TEXTAREA': 954 case 'TITLE': 955 $found_closer = $this->skip_rcdata( $tag_name ); 956 break; 957 958 /* 959 * In the browser this list would include the NOSCRIPT element, 960 * but the Tag Processor is an environment with the scripting 961 * flag disabled, meaning that it needs to descend into the 962 * NOSCRIPT element to be able to properly process what will be 963 * sent to a browser. 964 * 965 * Note that this rule makes HTML5 syntax incompatible with XML, 966 * because the parsing of this token depends on client application. 967 * The NOSCRIPT element cannot be represented in the XHTML syntax. 968 */ 969 case 'IFRAME': 970 case 'NOEMBED': 971 case 'NOFRAMES': 972 case 'STYLE': 973 case 'XMP': 974 $found_closer = $this->skip_rawtext( $tag_name ); 975 break; 976 977 // No other tags should be treated in their entirety here. 978 default: 979 return true; 980 } 981 982 if ( ! 
$found_closer ) { 983 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 984 $this->bytes_already_parsed = $was_at; 985 return false; 986 } 987 988 /* 989 * The values here look like they reference the opening tag but they reference 990 * the closing tag instead. This is why the opening tag values were stored 991 * above in a variable. It reads confusingly here, but that's because the 992 * functions that skip the contents have moved all the internal cursors past 993 * the inner content of the tag. 994 */ 995 $this->token_starts_at = $was_at; 996 $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; 997 $this->text_starts_at = $tag_ends_at + 1; 998 $this->text_length = $this->tag_name_starts_at - $this->text_starts_at; 999 $this->tag_name_starts_at = $tag_name_starts_at; 1000 $this->tag_name_length = $tag_name_length; 1001 $this->attributes = $attributes; 1002 $this->duplicate_attributes = $duplicate_attributes; 814 1003 815 1004 return true; … … 831 1020 */ 832 1021 public function paused_at_incomplete_token() { 833 return self::STATE_INCOMPLETE === $this->parser_state;1022 return self::STATE_INCOMPLETE_INPUT === $this->parser_state; 834 1023 } 835 1024 … … 1008 1197 public function set_bookmark( $name ) { 1009 1198 // It only makes sense to set a bookmark if the parser has paused on a concrete token. 1010 if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { 1199 if ( 1200 self::STATE_COMPLETE === $this->parser_state || 1201 self::STATE_INCOMPLETE_INPUT === $this->parser_state 1202 ) { 1011 1203 return false; 1012 1204 } … … 1083 1275 1084 1276 while ( false !== $at && $at < $doc_length ) { 1085 $at = strpos( $this->html, '</', $at ); 1277 $at = strpos( $this->html, '</', $at ); 1278 $this->tag_name_starts_at = $at; 1086 1279 1087 1280 // Fail if there is no possible tag closer. 
… … 1090 1283 } 1091 1284 1092 $closer_potentially_starts_at = $at; 1093 $at += 2; 1285 $at += 2; 1094 1286 1095 1287 /* … … 1132 1324 continue; 1133 1325 } 1326 1134 1327 $at = $this->bytes_already_parsed; 1135 1328 if ( $at >= strlen( $this->html ) ) { … … 1137 1330 } 1138 1331 1139 if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) { 1140 $this->bytes_already_parsed = $closer_potentially_starts_at; 1332 if ( '>' === $html[ $at ] ) { 1333 $this->bytes_already_parsed = $at + 1; 1334 return true; 1335 } 1336 1337 if ( $at + 1 >= strlen( $this->html ) ) { 1338 return false; 1339 } 1340 1341 if ( '/' === $html[ $at ] && '>' === $html[ $at + 1 ] ) { 1342 $this->bytes_already_parsed = $at + 2; 1141 1343 return true; 1142 1344 } … … 1260 1462 if ( $is_closing ) { 1261 1463 $this->bytes_already_parsed = $closer_potentially_starts_at; 1464 $this->tag_name_starts_at = $closer_potentially_starts_at; 1262 1465 if ( $this->bytes_already_parsed >= $doc_length ) { 1263 1466 return false; … … 1269 1472 1270 1473 if ( $this->bytes_already_parsed >= $doc_length ) { 1271 $this->parser_state = self::STATE_INCOMPLETE ;1474 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1272 1475 1273 1476 return false; … … 1275 1478 1276 1479 if ( '>' === $html[ $this->bytes_already_parsed ] ) { 1277 $this->bytes_already_parsed = $closer_potentially_starts_at;1480 ++$this->bytes_already_parsed; 1278 1481 return true; 1279 1482 } … … 1304 1507 $html = $this->html; 1305 1508 $doc_length = strlen( $html ); 1306 $at = $this->bytes_already_parsed; 1509 $was_at = $this->bytes_already_parsed; 1510 $at = $was_at; 1307 1511 1308 1512 while ( false !== $at && $at < $doc_length ) { 1309 1513 $at = strpos( $html, '<', $at ); 1514 1515 if ( $at > $was_at ) { 1516 $this->parser_state = self::STATE_TEXT_NODE; 1517 $this->token_starts_at = $was_at; 1518 $this->token_length = $at - $was_at; 1519 $this->text_starts_at = $was_at; 1520 $this->text_length = $this->token_length; 1521 $this->bytes_already_parsed = 
$at; 1522 return true; 1523 } 1310 1524 1311 1525 /* … … 1314 1528 */ 1315 1529 if ( false === $at ) { 1316 return false; 1530 $this->parser_state = self::STATE_TEXT_NODE; 1531 $this->token_starts_at = $was_at; 1532 $this->token_length = strlen( $html ) - $was_at; 1533 $this->text_starts_at = $was_at; 1534 $this->text_length = $this->token_length; 1535 $this->bytes_already_parsed = strlen( $html ); 1536 return true; 1317 1537 } 1318 1538 … … 1343 1563 if ( $tag_name_prefix_length > 0 ) { 1344 1564 ++$at; 1565 $this->parser_state = self::STATE_MATCHED_TAG; 1566 $this->tag_name_starts_at = $at; 1345 1567 $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); 1346 $this->tag_name_starts_at = $at;1347 1568 $this->bytes_already_parsed = $at + $this->tag_name_length; 1348 1569 return true; … … 1354 1575 */ 1355 1576 if ( $at + 1 >= $doc_length ) { 1356 $this->parser_state = self::STATE_INCOMPLETE ;1577 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1357 1578 1358 1579 return false; … … 1360 1581 1361 1582 /* 1362 * <!transitions to markup declaration open state1583 * `<!` transitions to markup declaration open state 1363 1584 * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state 1364 1585 */ 1365 1586 if ( '!' === $html[ $at + 1 ] ) { 1366 1587 /* 1367 * <!-- transitions to a bogus comment state – skip to the nearest -->1588 * `<!--` transitions to a comment state – apply further comment rules. 1368 1589 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state 1369 1590 */ … … 1376 1597 // If it's not possible to close the comment then there is nothing more to scan. 
1377 1598 if ( $doc_length <= $closer_at ) { 1378 $this->parser_state = self::STATE_INCOMPLETE ;1599 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1379 1600 1380 1601 return false; … … 1384 1605 $span_of_dashes = strspn( $html, '-', $closer_at ); 1385 1606 if ( '>' === $html[ $closer_at + $span_of_dashes ] ) { 1386 $at = $closer_at + $span_of_dashes + 1; 1387 continue; 1607 /* 1608 * @todo When implementing `set_modifiable_text()` ensure that updates to this token 1609 * don't break the syntax for short comments, e.g. `<!--->`. Unlike other comment 1610 * and bogus comment syntax, these leave no clear insertion point for text and 1611 * they need to be modified specially in order to contain text. E.g. to store 1612 * `?` as the modifiable text, the `<!--->` needs to become `<!--?-->`, which 1613 * involves inserting an additional `-` into the token after the modifiable text. 1614 */ 1615 $this->parser_state = self::STATE_COMMENT; 1616 $this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT; 1617 $this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at; 1618 1619 // Only provide modifiable text if the token is long enough to contain it. 
1620 if ( $span_of_dashes >= 2 ) { 1621 $this->comment_type = self::COMMENT_AS_HTML_COMMENT; 1622 $this->text_starts_at = $this->token_starts_at + 4; 1623 $this->text_length = $span_of_dashes - 2; 1624 } 1625 1626 $this->bytes_already_parsed = $closer_at + $span_of_dashes + 1; 1627 return true; 1388 1628 } 1389 1629 … … 1398 1638 $closer_at = strpos( $html, '--', $closer_at ); 1399 1639 if ( false === $closer_at ) { 1400 $this->parser_state = self::STATE_INCOMPLETE ;1640 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1401 1641 1402 1642 return false; … … 1404 1644 1405 1645 if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) { 1406 $at = $closer_at + 3; 1407 continue 2; 1646 $this->parser_state = self::STATE_COMMENT; 1647 $this->comment_type = self::COMMENT_AS_HTML_COMMENT; 1648 $this->token_length = $closer_at + 3 - $this->token_starts_at; 1649 $this->text_starts_at = $this->token_starts_at + 4; 1650 $this->text_length = $closer_at - $this->text_starts_at; 1651 $this->bytes_already_parsed = $closer_at + 3; 1652 return true; 1408 1653 } 1409 1654 1410 if ( $closer_at + 3 < $doc_length && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) { 1411 $at = $closer_at + 4; 1412 continue 2; 1655 if ( 1656 $closer_at + 3 < $doc_length && 1657 '!' === $html[ $closer_at + 2 ] && 1658 '>' === $html[ $closer_at + 3 ] 1659 ) { 1660 $this->parser_state = self::STATE_COMMENT; 1661 $this->comment_type = self::COMMENT_AS_HTML_COMMENT; 1662 $this->token_length = $closer_at + 4 - $this->token_starts_at; 1663 $this->text_starts_at = $this->token_starts_at + 4; 1664 $this->text_length = $closer_at - $this->text_starts_at; 1665 $this->bytes_already_parsed = $closer_at + 4; 1666 return true; 1413 1667 } 1414 1668 } … … 1416 1670 1417 1671 /* 1418 * <![CDATA[ transitions to CDATA section state – skip to the nearest ]]> 1419 * The CDATA is case-sensitive. 
1420 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state 1421 */ 1422 if ( 1423 $doc_length > $at + 8 && 1424 '[' === $html[ $at + 2 ] && 1425 'C' === $html[ $at + 3 ] && 1426 'D' === $html[ $at + 4 ] && 1427 'A' === $html[ $at + 5 ] && 1428 'T' === $html[ $at + 6 ] && 1429 'A' === $html[ $at + 7 ] && 1430 '[' === $html[ $at + 8 ] 1431 ) { 1432 $closer_at = strpos( $html, ']]>', $at + 9 ); 1433 if ( false === $closer_at ) { 1434 $this->parser_state = self::STATE_INCOMPLETE; 1435 1436 return false; 1437 } 1438 1439 $at = $closer_at + 3; 1440 continue; 1441 } 1442 1443 /* 1444 * <!DOCTYPE transitions to DOCTYPE state – skip to the nearest > 1672 * `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest > 1445 1673 * These are ASCII-case-insensitive. 1446 1674 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state … … 1458 1686 $closer_at = strpos( $html, '>', $at + 9 ); 1459 1687 if ( false === $closer_at ) { 1460 $this->parser_state = self::STATE_INCOMPLETE ;1688 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1461 1689 1462 1690 return false; 1463 1691 } 1464 1692 1465 $at = $closer_at + 1; 1466 continue; 1693 $this->parser_state = self::STATE_DOCTYPE; 1694 $this->token_length = $closer_at + 1 - $this->token_starts_at; 1695 $this->text_starts_at = $this->token_starts_at + 9; 1696 $this->text_length = $closer_at - $this->text_starts_at; 1697 $this->bytes_already_parsed = $closer_at + 1; 1698 return true; 1467 1699 } 1468 1700 … … 1472 1704 * found then the HTML was truncated inside the markup declaration. 
1473 1705 */ 1474 $ at = strpos( $html, '>', $at + 1 );1475 if ( false === $ at ) {1476 $this->parser_state = self::STATE_INCOMPLETE ;1706 $closer_at = strpos( $html, '>', $at + 1 ); 1707 if ( false === $closer_at ) { 1708 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1477 1709 1478 1710 return false; 1479 1711 } 1480 1712 1481 continue; 1713 $this->parser_state = self::STATE_COMMENT; 1714 $this->comment_type = self::COMMENT_AS_INVALID_HTML; 1715 $this->token_length = $closer_at + 1 - $this->token_starts_at; 1716 $this->text_starts_at = $this->token_starts_at + 2; 1717 $this->text_length = $closer_at - $this->text_starts_at; 1718 $this->bytes_already_parsed = $closer_at + 1; 1719 1720 /* 1721 * Identify nodes that would be CDATA if HTML had CDATA sections. 1722 * 1723 * This section must occur after identifying the bogus comment end 1724 * because in an HTML parser it will span to the nearest `>`, even 1725 * if there's no `]]>` as would be required in an XML document. It 1726 * is therefore not possible to parse a CDATA section containing 1727 * a `>` in the HTML syntax. 1728 * 1729 * Inside foreign elements there is a discrepancy between browsers 1730 * and the specification on this. 1731 * 1732 * @todo Track whether the Tag Processor is inside a foreign element 1733 * and require the proper closing `]]>` in those cases. 
1734 */ 1735 if ( 1736 $this->token_length >= 10 && 1737 '[' === $html[ $this->token_starts_at + 2 ] && 1738 'C' === $html[ $this->token_starts_at + 3 ] && 1739 'D' === $html[ $this->token_starts_at + 4 ] && 1740 'A' === $html[ $this->token_starts_at + 5 ] && 1741 'T' === $html[ $this->token_starts_at + 6 ] && 1742 'A' === $html[ $this->token_starts_at + 7 ] && 1743 '[' === $html[ $this->token_starts_at + 8 ] && 1744 ']' === $html[ $closer_at - 1 ] 1745 ) { 1746 $this->parser_state = self::STATE_COMMENT; 1747 $this->comment_type = self::COMMENT_AS_CDATA_LOOKALIKE; 1748 $this->text_starts_at += 7; 1749 $this->text_length -= 9; 1750 } 1751 1752 return true; 1482 1753 } 1483 1754 … … 1492 1763 */ 1493 1764 if ( '>' === $html[ $at + 1 ] ) { 1494 ++$at; 1495 continue; 1765 $this->parser_state = self::STATE_PRESUMPTUOUS_TAG; 1766 $this->token_length = $at + 2 - $this->token_starts_at; 1767 $this->bytes_already_parsed = $at + 2; 1768 return true; 1496 1769 } 1497 1770 1498 1771 /* 1499 * <?transitions to a bogus comment state – skip to the nearest >1772 * `<?` transitions to a bogus comment state – skip to the nearest > 1500 1773 * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state 1501 1774 */ … … 1503 1776 $closer_at = strpos( $html, '>', $at + 2 ); 1504 1777 if ( false === $closer_at ) { 1505 $this->parser_state = self::STATE_INCOMPLETE ;1778 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1506 1779 1507 1780 return false; 1508 1781 } 1509 1782 1510 $at = $closer_at + 1; 1511 continue; 1783 $this->parser_state = self::STATE_COMMENT; 1784 $this->comment_type = self::COMMENT_AS_INVALID_HTML; 1785 $this->token_length = $closer_at + 1 - $this->token_starts_at; 1786 $this->text_starts_at = $this->token_starts_at + 2; 1787 $this->text_length = $closer_at - $this->text_starts_at; 1788 $this->bytes_already_parsed = $closer_at + 1; 1789 1790 /* 1791 * Identify a Processing Instruction node were HTML to have them. 
1792 * 1793 * This section must occur after identifying the bogus comment end 1794 * because in an HTML parser it will span to the nearest `>`, even 1795 * if there's no `?>` as would be required in an XML document. It 1796 * is therefore not possible to parse a Processing Instruction node 1797 * containing a `>` in the HTML syntax. 1798 * 1799 * XML allows for more target names, but this code only identifies 1800 * those with ASCII-representable target names. This means that it 1801 * may identify some Processing Instruction nodes as bogus comments, 1802 * but it will not misinterpret the HTML structure. By limiting the 1803 * identification to these target names the Tag Processor can avoid 1804 * the need to start parsing UTF-8 sequences. 1805 * 1806 * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | 1807 * [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | 1808 * [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | 1809 * [#x10000-#xEFFFF] 1810 * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] 1811 * 1812 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget 1813 */ 1814 if ( $this->token_length >= 5 && '?' 
=== $html[ $closer_at - 1 ] ) { 1815 $comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 ); 1816 $pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' ); 1817 1818 if ( 0 < $pi_target_length ) { 1819 $pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length ); 1820 1821 $this->comment_type = self::COMMENT_AS_PI_NODE_LOOKALIKE; 1822 $this->tag_name_starts_at = $this->token_starts_at + 2; 1823 $this->tag_name_length = $pi_target_length; 1824 $this->text_starts_at += $pi_target_length; 1825 $this->text_length -= $pi_target_length + 1; 1826 } 1827 } 1828 1829 return true; 1512 1830 } 1513 1831 … … 1515 1833 * If a non-alpha starts the tag name in a tag closer it's a comment. 1516 1834 * Find the first `>`, which closes the comment. 1835 * 1836 * This parser classifies these particular comments as special "funky comments" 1837 * which are made available for further processing. 
1517 1838 * 1518 1839 * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name … … 1526 1847 $closer_at = strpos( $html, '>', $at + 3 ); 1527 1848 if ( false === $closer_at ) { 1528 $this->parser_state = self::STATE_INCOMPLETE ;1849 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1529 1850 1530 1851 return false; 1531 1852 } 1532 1853 1533 $at = $closer_at + 1; 1534 continue; 1854 $this->parser_state = self::STATE_FUNKY_COMMENT; 1855 $this->token_length = $closer_at + 1 - $this->token_starts_at; 1856 $this->text_starts_at = $this->token_starts_at + 2; 1857 $this->text_length = $closer_at - $this->text_starts_at; 1858 $this->bytes_already_parsed = $closer_at + 1; 1859 return true; 1535 1860 } 1536 1861 … … 1552 1877 $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed ); 1553 1878 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1554 $this->parser_state = self::STATE_INCOMPLETE ;1879 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1555 1880 1556 1881 return false; … … 1576 1901 $this->bytes_already_parsed += $name_length; 1577 1902 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1578 $this->parser_state = self::STATE_INCOMPLETE ;1903 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1579 1904 1580 1905 return false; … … 1583 1908 $this->skip_whitespace(); 1584 1909 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1585 $this->parser_state = self::STATE_INCOMPLETE ;1910 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1586 1911 1587 1912 return false; … … 1593 1918 $this->skip_whitespace(); 1594 1919 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1595 $this->parser_state = self::STATE_INCOMPLETE ;1920 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 1596 1921 1597 1922 return false; … … 1621 1946 1622 1947 if ( $attribute_end >= strlen( $this->html ) ) { 1623 $this->parser_state = self::STATE_INCOMPLETE ;1948 $this->parser_state = 
self::STATE_INCOMPLETE_INPUT; 1624 1949 1625 1950 return false; … … 1693 2018 $this->tag_name_starts_at = null; 1694 2019 $this->tag_name_length = null; 2020 $this->text_starts_at = 0; 2021 $this->text_length = 0; 1695 2022 $this->is_closing_tag = null; 1696 2023 $this->attributes = array(); 2024 $this->comment_type = null; 1697 2025 $this->duplicate_attributes = null; 1698 2026 } … … 1986 2314 // Point this tag processor before the sought tag opener and consume it. 1987 2315 $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; 1988 return $this->next_t ag( array( 'tag_closers' => 'visit' ));2316 return $this->next_token(); 1989 2317 } 1990 2318 … … 2217 2545 */ 2218 2546 public function get_tag() { 2219 if ( self::STATE_MATCHED_TAG !== $this->parser_state) {2547 if ( null === $this->tag_name_starts_at ) { 2220 2548 return null; 2221 2549 } … … 2223 2551 $tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); 2224 2552 2225 return strtoupper( $tag_name ); 2553 if ( self::STATE_MATCHED_TAG === $this->parser_state ) { 2554 return strtoupper( $tag_name ); 2555 } 2556 2557 if ( 2558 self::STATE_COMMENT === $this->parser_state && 2559 self::COMMENT_AS_PI_NODE_LOOKALIKE === $this->get_comment_type() 2560 ) { 2561 return $tag_name; 2562 } 2563 2564 return null; 2226 2565 } 2227 2566 … … 2280 2619 $this->is_closing_tag 2281 2620 ); 2621 } 2622 2623 /** 2624 * Indicates the kind of matched token, if any. 2625 * 2626 * This differs from `get_token_name()` in that it always 2627 * returns a static string indicating the type, whereas 2628 * `get_token_name()` may return values derived from the 2629 * token itself, such as a tag name or processing 2630 * instruction tag. 2631 * 2632 * Possible values: 2633 * - `#tag` when matched on a tag. 2634 * - `#text` when matched on a text node. 2635 * - `#cdata-section` when matched on a CDATA node. 2636 * - `#comment` when matched on a comment. 
2637 * - `#doctype` when matched on a DOCTYPE declaration. 2638 * - `#presumptuous-tag` when matched on an empty tag closer. 2639 * - `#funky-comment` when matched on a funky comment. 2640 * 2641 * @since 6.5.0 2642 * 2643 * @return string|null What kind of token is matched, or null. 2644 */ 2645 public function get_token_type() { 2646 switch ( $this->parser_state ) { 2647 case self::STATE_MATCHED_TAG: 2648 return '#tag'; 2649 2650 case self::STATE_DOCTYPE: 2651 return '#doctype'; 2652 2653 default: 2654 return $this->get_token_name(); 2655 } 2656 } 2657 2658 /** 2659 * Returns the node name represented by the token. 2660 * 2661 * This matches the DOM API value `nodeName`. Some values 2662 * are static, such as `#text` for a text node, while others 2663 * are dynamically generated from the token itself. 2664 * 2665 * Dynamic names: 2666 * - Uppercase tag name for tag matches. 2667 * - `html` for DOCTYPE declarations. 2668 * 2669 * Note that if the Tag Processor is not matched on a token 2670 * then this function will return `null`, either because it 2671 * hasn't yet found a token or because it reached the end 2672 * of the document without matching a token. 2673 * 2674 * @since 6.5.0 2675 * 2676 * @return string|null Name of the matched token. 2677 */ 2678 public function get_token_name() { 2679 switch ( $this->parser_state ) { 2680 case self::STATE_MATCHED_TAG: 2681 return $this->get_tag(); 2682 2683 case self::STATE_TEXT_NODE: 2684 return '#text'; 2685 2686 case self::STATE_CDATA_NODE: 2687 return '#cdata-section'; 2688 2689 case self::STATE_COMMENT: 2690 return '#comment'; 2691 2692 case self::STATE_DOCTYPE: 2693 return 'html'; 2694 2695 case self::STATE_PRESUMPTUOUS_TAG: 2696 return '#presumptuous-tag'; 2697 2698 case self::STATE_FUNKY_COMMENT: 2699 return '#funky-comment'; 2700 } 2701 } 2702 2703 /** 2704 * Indicates what kind of comment produced the comment node. 
2705 * 2706 * Because there are different kinds of HTML syntax which produce 2707 * comments, the Tag Processor tracks and exposes this as a type 2708 * for the comment. Nominally only regular HTML comments exist as 2709 * they are commonly known, but a number of unrelated syntax errors 2710 * also produce comments. 2711 * 2712 * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT 2713 * @see self::COMMENT_AS_CDATA_LOOKALIKE 2714 * @see self::COMMENT_AS_INVALID_HTML 2715 * @see self::COMMENT_AS_HTML_COMMENT 2716 * @see self::COMMENT_AS_PI_NODE_LOOKALIKE 2717 * 2718 * @since 6.5.0 2719 * 2720 * @return string|null 2721 */ 2722 public function get_comment_type() { 2723 if ( self::STATE_COMMENT !== $this->parser_state ) { 2724 return null; 2725 } 2726 2727 return $this->comment_type; 2728 } 2729 2730 /** 2731 * Returns the modifiable text for a matched token, or an empty string. 2732 * 2733 * Modifiable text is text content that may be read and changed without 2734 * changing the HTML structure of the document around it. This includes 2735 * the contents of `#text` nodes in the HTML as well as the inner 2736 * contents of HTML comments, Processing Instructions, and others, even 2737 * though these nodes aren't part of a parsed DOM tree. They also contain 2738 * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any 2739 * other section in an HTML document which cannot contain HTML markup (DATA). 2740 * 2741 * If a token has no modifiable text then an empty string is returned to 2742 * avoid needless crashing or type errors. An empty string does not mean 2743 * that a token has modifiable text, and a token with modifiable text may 2744 * have an empty string (e.g. a comment with no contents). 
2745 * 2746 * @since 6.5.0 2747 * 2748 * @return string 2749 */ 2750 public function get_modifiable_text() { 2751 if ( null === $this->text_starts_at ) { 2752 return ''; 2753 } 2754 2755 $text = substr( $this->html, $this->text_starts_at, $this->text_length ); 2756 2757 // Comment data is not decoded. 2758 if ( 2759 self::STATE_CDATA_NODE === $this->parser_state || 2760 self::STATE_COMMENT === $this->parser_state || 2761 self::STATE_DOCTYPE === $this->parser_state || 2762 self::STATE_FUNKY_COMMENT === $this->parser_state 2763 ) { 2764 return $text; 2765 } 2766 2767 $tag_name = $this->get_tag(); 2768 if ( 2769 // Script data is not decoded. 2770 'SCRIPT' === $tag_name || 2771 2772 // RAWTEXT data is not decoded. 2773 'IFRAME' === $tag_name || 2774 'NOEMBED' === $tag_name || 2775 'NOFRAMES' === $tag_name || 2776 'STYLE' === $tag_name || 2777 'XMP' === $tag_name 2778 ) { 2779 return $text; 2780 } 2781 2782 $decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE ); 2783 2784 if ( empty( $decoded ) ) { 2785 return ''; 2786 } 2787 2788 /* 2789 * TEXTAREA skips a leading newline, but this newline may appear not only as the 2790 * literal character `\n`, but also as a character reference, such as in the 2791 * following markup: `<textarea>&#x0a;Content</textarea>`. 2792 * 2793 * For these cases it's important to first decode the text content before checking 2794 * for a leading newline and removing it. 2795 */ 2796 if ( 2797 self::STATE_MATCHED_TAG === $this->parser_state && 2798 'TEXTAREA' === $tag_name && 2799 strlen( $decoded ) > 0 && 2800 "\n" === $decoded[0] 2801 ) { 2802 return substr( $decoded, 1 ); 2803 } 2804 2805 return $decoded; 2282 2806 } 2283 2807 … … 2747 3271 2748 3272 /** 2749 * Parser Ready State 3273 * Parser Ready State. 2750 3274 * 2751 3275 * Indicates that the parser is ready to run and waiting for a state transition. … … 2760 3284 2761 3285 /** 2762 * Parser Complete State 3286 * Parser Complete State. 2763 3287 * 2764 3288 * Indicates that the parser has reached the end of the document and there is … … 2772 3296 2773 3297 /** 2774 * Parser Incomplete State3298 * Parser Incomplete Input State. 2775 3299 * 2776 3300 * Indicates that the parser has reached the end of the document before finishing … … 2785 3309 * @access private 2786 3310 */ 2787 const STATE_INCOMPLETE = 'STATE_INCOMPLETE';2788 2789 /** 2790 * Parser Matched Tag State 3311 const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT'; 3312 3313 /** 3314 * Parser Matched Tag State. 2791 3315 * 2792 3316 * Indicates that the parser has found an HTML tag and it's possible to get … … 2798 3322 */ 2799 3323 const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG'; 3324 3325 /** 3326 * Parser Text Node State. 3327 * 3328 * Indicates that the parser has found a text node and it's possible 3329 * to read and modify that text. 3330 * 3331 * @since 6.5.0 3332 * 3333 * @access private 3334 */ 3335 const STATE_TEXT_NODE = 'STATE_TEXT_NODE'; 3336 3337 /** 3338 * Parser CDATA Node State. 3339 * 3340 * Indicates that the parser has found a CDATA node and it's possible 3341 * to read and modify its modifiable text. Note that in HTML there are 3342 * no CDATA nodes outside of foreign content (SVG and MathML).
Outside 3343 * of foreign content, they are treated as HTML comments. 3344 * 3345 * @since 6.5.0 3346 * 3347 * @access private 3348 */ 3349 const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; 3350 3351 /** 3352 * Indicates that the parser has found an HTML comment and it's 3353 * possible to read and modify its modifiable text. 3354 * 3355 * @since 6.5.0 3356 * 3357 * @access private 3358 */ 3359 const STATE_COMMENT = 'STATE_COMMENT'; 3360 3361 /** 3362 * Indicates that the parser has found a DOCTYPE node and it's 3363 * possible to read and modify its modifiable text. 3364 * 3365 * @since 6.5.0 3366 * 3367 * @access private 3368 */ 3369 const STATE_DOCTYPE = 'STATE_DOCTYPE'; 3370 3371 /** 3372 * Indicates that the parser has found an empty tag closer `</>`. 3373 * 3374 * Note that in HTML there are no empty tag closers, and they 3375 * are ignored. Nonetheless, the Tag Processor still 3376 * recognizes them as they appear in the HTML stream. 3377 * 3378 * These were historically discussed as a "presumptuous tag 3379 * closer," which would close the nearest open tag, but were 3380 * dismissed in favor of explicitly-closing tags. 3381 * 3382 * @since 6.5.0 3383 * 3384 * @access private 3385 */ 3386 const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG'; 3387 3388 /** 3389 * Indicates that the parser has found a "funky comment" 3390 * and it's possible to read and modify its modifiable text. 3391 * 3392 * Example: 3393 * 3394 * </%url> 3395 * </{"wp-bit":"query/post-author"}> 3396 * </2> 3397 * 3398 * Funky comments are tag closers with invalid tag names. Note 3399 * that in HTML these are turned into bogus comments. Nonetheless, 3400 * the Tag Processor recognizes them in a stream of HTML and 3401 * exposes them for inspection and modification. 3402 * 3403 * @since 6.5.0 3404 * 3405 * @access private 3406 */ 3407 const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY'; 3408 3409 /** 3410 * Indicates that a comment was created when encountering an abruptly-closed HTML comment.
3411 * 3412 * Example: 3413 * 3414 * <!--> 3415 * <!---> 3416 * 3417 * @since 6.5.0 3418 */ 3419 const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT'; 3420 3421 /** 3422 * Indicates that a comment would be parsed as a CDATA node, 3423 * were HTML to allow CDATA nodes outside of foreign content. 3424 * 3425 * Example: 3426 * 3427 * <![CDATA[This is a CDATA node.]]> 3428 * 3429 * This is an HTML comment, but it looks like a CDATA node. 3430 * 3431 * @since 6.5.0 3432 */ 3433 const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE'; 3434 3435 /** 3436 * Indicates that a comment was created when encountering 3437 * normative HTML comment syntax. 3438 * 3439 * Example: 3440 * 3441 * <!-- this is a comment --> 3442 * 3443 * @since 6.5.0 3444 */ 3445 const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT'; 3446 3447 /** 3448 * Indicates that a comment would be parsed as a Processing 3449 * Instruction node, were they to exist within HTML. 3450 * 3451 * Example: 3452 * 3453 * <?wp __( 'Like' ) ?> 3454 * 3455 * This is an HTML comment, but it looks like a Processing Instruction node. 3456 * 3457 * @since 6.5.0 3458 */ 3459 const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE'; 3460 3461 /** 3462 * Indicates that a comment was created when encountering invalid 3463 * HTML input, a so-called "bogus comment." 3464 * 3465 * Example: 3466 * 3467 * <?nothing special> 3468 * <!{nothing special}> 3469 * 3470 * @since 6.5.0 3471 */ 3472 const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML'; 2800 3473 } -
trunk/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php
r57343 r57348 515 515 */ 516 516 public function test_can_seek_back_and_forth() { 517 $p = WP_HTML_Processor::create_fragment( '<div><p one><div><p><div two><p><div><p><div><p three>' ); 517 $p = WP_HTML_Processor::create_fragment( 518 <<<'HTML' 519 <div>text<p one>more stuff<div><![CDATA[this is not real CDATA]]><p><!-- hi --><div two><p><div><p>three comes soon<div><p three>' ); 520 HTML 521 ); 518 522 519 523 // Find first tag of interest. -
trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php
r57211 r57348 558 558 559 559 $p->next_tag(); 560 $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the </script> tag closer' ); 561 $this->assertTrue( $p->is_tag_closer(), 'Indicated a <script> tag opener is a tag closer' ); 560 $this->assertFalse( 561 $p->next_tag( array( 'tag_closers' => 'visit' ) ), 562 'Should not have found closing SCRIPT tag when closing an opener.' 563 ); 562 564 563 565 $p = new WP_HTML_Tag_Processor( 'abc</script>' ); … … 567 569 568 570 $p->next_tag(); 569 $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the </textarea> tag closer' ); 570 $this->assertTrue( $p->is_tag_closer(), 'Indicated a <textarea> tag opener is a tag closer' ); 571 $this->assertFalse( 572 $p->next_tag( array( 'tag_closers' => 'visit' ) ), 573 'Should not have found closing TEXTAREA when closing an opener.' 574 ); 571 575 572 576 $p = new WP_HTML_Tag_Processor( 'abc</textarea>' ); … … 576 580 577 581 $p->next_tag(); 578 $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the </title> tag closer' ); 579 $this->assertTrue( $p->is_tag_closer(), 'Indicated a <title> tag opener is a tag closer' ); 582 $this->assertFalse( 583 $p->next_tag( array( 'tag_closers' => 'visit' ) ), 584 'Should not have found closing TITLE when closing an opener.' 585 ); 580 586 581 587 $p = new WP_HTML_Tag_Processor( 'abc</title>' ); … … 2358 2364 'Text with comments' => array( 'One <!-- sneaky --> comment.' ), 2359 2365 'Empty tag closer' => array( '</>' ), 2366 'CDATA as HTML comment' => array( '<![CDATA[this closes at the first >]>' ), 2360 2367 'Processing instruction' => array( '<?xml version="1.0"?>' ), 2361 2368 'Combination XML-like' => array( '<!DOCTYPE xml><?xml version=""?><!-- this is not a real document. 
--><![CDATA[it only serves as a test]]>' ), … … 2411 2418 'Partial CDATA' => array( '<![CDA' ), 2412 2419 'Partially closed CDATA]' => array( '<![CDATA[cannot escape]' ), 2413 'Partially closed CDATA]>' => array( '<![CDATA[cannot escape]>' ),2414 2420 'Unclosed IFRAME' => array( '<iframe><div>' ), 2415 2421 'Unclosed NOEMBED' => array( '<noembed><div>' ), … … 2508 2514 'tag inside of CDATA' => array( 2509 2515 'input' => '<![CDATA[This <is> a <strong id="yes">HTML Tag</strong>]]><span>test</span>', 2510 'expected' => '<![CDATA[This <is> a <strong id="yes">HTML Tag</strong>]]><span class="firstTag" foo="bar">test</span>',2516 'expected' => '<![CDATA[This <is> a <strong class="firstTag" foo="bar" id="yes">HTML Tag</strong>]]><span class="secondTag">test</span>', 2511 2517 ), 2512 2518 );
Note: See TracChangeset
for help on using the changeset viewer.