Make WordPress Core


Ignore:
Timestamp:
01/30/2024 10:07:42 PM (5 months ago)
Author:
dmsnell
Message:

HTML API: Fix splitting single text node.

When next_token() was introduced, it brought a subtle bug: when the parser encountered a < in the HTML stream that did not start a tag, comment, or other token, it treated the text up to that point as one text node and the following span as a second text node.

The entire span should be one text node.

In this patch the Tag Processor properly detects this scenario and combines the spans into one text node.

Follow-up to [57348]

Props jonsurrell
Fixes #60385

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r57348 r57489  
    15131513            $at = strpos( $html, '<', $at );
    15141514
    1515             if ( $at > $was_at ) {
    1516                 $this->parser_state         = self::STATE_TEXT_NODE;
    1517                 $this->token_starts_at      = $was_at;
    1518                 $this->token_length         = $at - $was_at;
    1519                 $this->text_starts_at       = $was_at;
    1520                 $this->text_length          = $this->token_length;
    1521                 $this->bytes_already_parsed = $at;
    1522                 return true;
    1523             }
    1524 
    15251515            /*
    15261516             * This does not imply an incomplete parse; it indicates that there
     
    15341524                $this->text_length          = $this->token_length;
    15351525                $this->bytes_already_parsed = strlen( $html );
     1526                return true;
     1527            }
     1528
     1529            if ( $at > $was_at ) {
     1530                /*
     1531                 * A "<" has been found in the document. That may be the start of another node, or
     1532                 * it may be an "invalid-first-character-of-tag-name" error. If this is not the start
     1533                 * of another node the "<" should be included in this text node and another
     1534                 * termination point should be found for the text node.
     1535                 *
     1536                 * @see https://html.spec.whatwg.org/#tag-open-state
     1537                 */
     1538                if ( strlen( $html ) > $at + 1 ) {
     1539                    $next_character  = $html[ $at + 1 ];
     1540                    $at_another_node =
     1541                        '!' === $next_character ||
     1542                        '/' === $next_character ||
     1543                        '?' === $next_character ||
     1544                        ( 'A' <= $next_character && $next_character <= 'z' );
     1545                    if ( ! $at_another_node ) {
     1546                        ++$at;
     1547                        continue;
     1548                    }
     1549                }
     1550
     1551                $this->parser_state         = self::STATE_TEXT_NODE;
     1552                $this->token_starts_at      = $was_at;
     1553                $this->token_length         = $at - $was_at;
     1554                $this->text_starts_at       = $was_at;
     1555                $this->text_length          = $this->token_length;
     1556                $this->bytes_already_parsed = $at;
    15361557                return true;
    15371558            }
Note: See TracChangeset for help on using the changeset viewer.