Make WordPress Core


Ignore:
Timestamp:
06/03/2024 07:45:57 PM (6 months ago)
Author:
dmsnell
Message:

HTML API: Report real and virtual nodes in the HTML Processor.

HTML is a kind of short-hand for a DOM structure. This means that there are
many cases in HTML where an element's opening tag or closing tag is missing (or
both). This is because many of the parsing rules imply creating elements in the
DOM which may not exist in the text of the HTML.

The HTML Processor, being the higher-level counterpart to the Tag Processor, is
already aware of these nodes, but since its inception has not paused on them
when scanning through a document. Instead, these are visible when pausing on a
child of such an element, but otherwise not seen.

In this patch the HTML Processor starts exposing those implicitly-created nodes,
including opening tags, and closing tags, that aren't found in the text content
of the HTML input document.

Previously, the sequence of matched tokens when scanning with
WP_HTML_Processor::next_token() would depend on how the HTML document was written,
but with this patch, all semantically equal HTML documents will parse and scan in
the same exact manner, presenting an idealized or "perfect" view of the document
the same way as would occur when traversing a DOM in a browser.

Developed in https://github.com/WordPress/wordpress-develop/pull/6348
Discussed in https://core.trac.wordpress.org/ticket/61348

Props audrasjb, dmsnell, gziolo, jonsurrell.
Fixes #61348.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r58192 r58304  
    202202    private $release_internal_bookmark_on_destruct = null;
    203203
     204    /**
     205     * Stores stack events which arise during parsing of the
     206     * HTML document, which will then supply the "match" events.
     207     *
     208     * @since 6.6.0
     209     *
     210     * @var WP_HTML_Stack_Event[]
     211     */
     212    private $element_queue = array();
     213
     214    /**
     215     * Current stack event, if set, representing a matched token.
     216     *
     217     * Because the parser may internally point to a place further along in a document
     218     * than the nodes which have already been processed (some "virtual" nodes may have
     219     * appeared while scanning the HTML document), this will point at the "current" node
     220     * being processed. It comes from the front of the element queue.
     221     *
     222     * @since 6.6.0
     223     *
     224     * @var ?WP_HTML_Stack_Event
     225     */
     226    private $current_element = null;
     227
     228    /**
     229     * Context node if created as a fragment parser.
     230     *
     231     * @var ?WP_HTML_Token
     232     */
     233    private $context_node = null;
     234
     235    /**
     236     * Whether the parser has yet processed the context node,
     237     * if created as a fragment parser.
     238     *
     239     * The context node will be initially pushed onto the stack of open elements,
     240     * but when created as a fragment parser, this context element (and the implicit
     241     * HTML document node above it) should not be exposed as a matched token or node.
     242     *
     243     * This boolean indicates whether the processor should skip over the current
     244     * node in its initial search for the first node created from the input HTML.
     245     *
     246     * @var bool
     247     */
     248    private $has_seen_context_node = false;
     249
    204250    /*
    205251     * Public Interface Functions
     
    258304        );
    259305
    260         $processor->state->stack_of_open_elements->push(
    261             new WP_HTML_Token(
    262                 'context-node',
    263                 $processor->state->context_node[0],
    264                 false
    265             )
     306        $context_node = new WP_HTML_Token(
     307            'context-node',
     308            $processor->state->context_node[0],
     309            false
    266310        );
     311
     312        $processor->state->stack_of_open_elements->push( $context_node );
     313        $processor->context_node = $context_node;
    267314
    268315        return $processor;
     
    300347        $this->state = new WP_HTML_Processor_State();
    301348
     349        $this->state->stack_of_open_elements->set_push_handler(
     350            function ( WP_HTML_Token $token ) {
     351                $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH );
     352            }
     353        );
     354
     355        $this->state->stack_of_open_elements->set_pop_handler(
     356            function ( WP_HTML_Token $token ) {
     357                $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP );
     358            }
     359        );
     360
    302361        /*
    303362         * Create this wrapper so that it's possible to pass
     
    343402     *
    344403     * @since 6.4.0
     404     * @since 6.6.0 Visits all tokens, including virtual ones.
    345405     *
    346406     * @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
     
    350410     *
    351411     *     @type string|null $tag_name     Which tag to find, or `null` for "any tag."
     412     *     @type string      $tag_closers  'visit' to pause at tag closers, 'skip' or unset to only visit openers.
    352413     *     @type int|null    $match_offset Find the Nth tag matching all search criteria.
    353414     *                                     1 for "first" tag, 3 for "third," etc.
     
    360421     */
    361422    public function next_tag( $query = null ) {
     423        $visit_closers = isset( $query['tag_closers'] ) && 'visit' === $query['tag_closers'];
     424
    362425        if ( null === $query ) {
    363             while ( $this->step() ) {
     426            while ( $this->next_token() ) {
    364427                if ( '#tag' !== $this->get_token_type() ) {
    365428                    continue;
    366429                }
    367430
    368                 if ( ! $this->is_tag_closer() ) {
     431                if ( ! $this::is_tag_closer() || $visit_closers ) {
    369432                    return true;
    370433                }
     
    392455
    393456        if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) {
    394             while ( $this->step() ) {
     457            while ( $this->next_token() ) {
    395458                if ( '#tag' !== $this->get_token_type() ) {
    396459                    continue;
     
    401464                }
    402465
    403                 if ( ! $this->is_tag_closer() ) {
     466                if ( ! parent::is_tag_closer() || $visit_closers ) {
    404467                    return true;
    405468                }
    406469            }
    407470
    408             return false;
    409         }
    410 
    411         if ( isset( $query['tag_closers'] ) && 'visit' === $query['tag_closers'] ) {
    412             _doing_it_wrong(
    413                 __METHOD__,
    414                 __( 'Cannot visit tag closers in HTML Processor.' ),
    415                 '6.4.0'
    416             );
    417471            return false;
    418472        }
     
    421475        $match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1;
    422476
    423         while ( $match_offset > 0 && $this->step() ) {
    424             if ( '#tag' !== $this->get_token_type() ) {
     477        while ( $match_offset > 0 && $this->next_token() ) {
     478            if ( '#tag' !== $this->get_token_type() || $this->is_tag_closer() ) {
    425479                continue;
    426480            }
     
    453507     */
    454508    public function next_token() {
    455         return $this->step();
     509        $this->current_element = null;
     510
     511        if ( isset( $this->last_error ) ) {
     512            return false;
     513        }
     514
     515        if ( 0 === count( $this->element_queue ) && ! $this->step() ) {
     516            while ( $this->state->stack_of_open_elements->pop() ) {
     517                continue;
     518            }
     519        }
     520
     521        $this->current_element = array_shift( $this->element_queue );
     522        while ( isset( $this->context_node ) && ! $this->has_seen_context_node ) {
     523            if ( isset( $this->current_element ) ) {
     524                if ( $this->context_node === $this->current_element->token && WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) {
     525                    $this->has_seen_context_node = true;
     526                    return $this->next_token();
     527                }
     528            }
     529            $this->current_element = array_shift( $this->element_queue );
     530        }
     531
     532        if ( ! isset( $this->current_element ) ) {
     533            return $this->next_token();
     534        }
     535
     536        if ( isset( $this->context_node ) && WP_HTML_Stack_Event::POP === $this->current_element->operation && $this->context_node === $this->current_element->token ) {
     537            $this->element_queue   = array();
     538            $this->current_element = null;
     539            return false;
     540        }
     541
     542        // Avoid sending close events for elements which don't expect a closing.
     543        if (
     544            WP_HTML_Stack_Event::POP === $this->current_element->operation &&
     545            ! static::expects_closer( $this->current_element->token->node_name )
     546        ) {
     547            return $this->next_token();
     548        }
     549
     550        return true;
     551    }
     552
     553
     554    /**
     555     * Indicates if the current tag token is a tag closer.
     556     *
     557     * Example:
     558     *
     559     *     $p = WP_HTML_Processor::create_fragment( '<div></div>' );
     560     *     $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
     561     *     $p->is_tag_closer() === false;
     562     *
     563     *     $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
     564     *     $p->is_tag_closer() === true;
     565     *
     566     * @since 6.6.0 Subclassed for HTML Processor.
     567     *
     568     * @return bool Whether the current tag is a tag closer.
     569     */
     570    public function is_tag_closer() {
     571        return isset( $this->current_element )
     572            ? ( WP_HTML_Stack_Event::POP === $this->current_element->operation )
     573            : parent::is_tag_closer();
    456574    }
    457575
     
    526644     *       SVG and MathML namespace.
    527645     *
     646     * @param  ?WP_HTML_Token $node Node to examine instead of current node, if provided.
    528647     * @return bool Whether to expect a closer for the currently-matched node,
    529648     *              or `null` if not matched on any token.
    530649     */
    531     public function expects_closer() {
    532         $token_name = $this->get_token_name();
     650    public function expects_closer( $node = null ) {
     651        $token_name = $node->node_name ?? $this->get_token_name();
    533652        if ( ! isset( $token_name ) ) {
    534653            return null;
     
    582701             */
    583702            $top_node = $this->state->stack_of_open_elements->current_node();
    584             if (
    585                 $top_node && (
    586                     // Void elements.
    587                     self::is_void( $top_node->node_name ) ||
    588                     // Comments, text nodes, and other atomic tokens.
    589                     '#' === $top_node->node_name[0] ||
    590                     // Doctype declarations.
    591                     'html' === $top_node->node_name
    592                 )
    593             ) {
     703            if ( isset( $top_node ) && ! static::expects_closer( $top_node ) ) {
    594704                $this->state->stack_of_open_elements->pop();
    595705            }
     
    651761     * @since 6.4.0
    652762     *
     763     * @todo make aware of queue of elements, because stack operations have already been done by now.
     764     *
    653765     * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
    654766     */
     
    709821        $token_name = $this->get_token_name();
    710822        $token_type = $this->get_token_type();
    711         $op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
     823        $op_sigil   = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : '';
    712824        $op         = "{$op_sigil}{$token_name}";
    713825
     
    12321344        }
    12331345
    1234         if ( ! $this->is_tag_closer() ) {
     1346        if ( ! parent::is_tag_closer() ) {
    12351347            /*
    12361348             * > Any other start tag
     
    13261438        if ( null !== $this->last_error ) {
    13271439            return null;
     1440        }
     1441
     1442        if ( isset( $this->current_element ) ) {
     1443            return $this->current_element->token->node_name;
    13281444        }
    13291445
     
    13441460
    13451461    /**
     1462     * Returns the node name represented by the token.
     1463     *
     1464     * This matches the DOM API value `nodeName`. Some values
     1465     * are static, such as `#text` for a text node, while others
     1466     * are dynamically generated from the token itself.
     1467     *
     1468     * Dynamic names:
     1469     *  - Uppercase tag name for tag matches.
     1470     *  - `html` for DOCTYPE declarations.
     1471     *
     1472     * Note that if the Tag Processor is not matched on a token
     1473     * then this function will return `null`, either because it
     1474     * hasn't yet found a token or because it reached the end
     1475     * of the document without matching a token.
     1476     *
     1477     * @since 6.6.0 Subclassed for the HTML Processor.
     1478     *
     1479     * @return string|null Name of the matched token.
     1480     */
     1481    public function get_token_name() {
     1482        if ( isset( $this->current_element ) ) {
     1483            return $this->current_element->token->node_name;
     1484        }
     1485
     1486        return parent::get_token_name();
     1487    }
     1488
     1489    /**
     1490     * Indicates the kind of matched token, if any.
     1491     *
     1492     * This differs from `get_token_name()` in that it always
     1493     * returns a static string indicating the type, whereas
     1494     * `get_token_name()` may return values derived from the
     1495     * token itself, such as a tag name or processing
     1496     * instruction tag.
     1497     *
     1498     * Possible values:
     1499     *  - `#tag` when matched on a tag.
     1500     *  - `#text` when matched on a text node.
     1501     *  - `#cdata-section` when matched on a CDATA node.
     1502     *  - `#comment` when matched on a comment.
     1503     *  - `#doctype` when matched on a DOCTYPE declaration.
     1504     *  - `#presumptuous-tag` when matched on an empty tag closer.
     1505     *  - `#funky-comment` when matched on a funky comment.
     1506     *
     1507     * @since 6.6.0 Subclassed for the HTML Processor.
     1508     *
     1509     * @return string|null What kind of token is matched, or null.
     1510     */
     1511    public function get_token_type() {
     1512        if ( isset( $this->current_element ) ) {
     1513            $node_name = $this->current_element->token->node_name;
     1514            if ( ctype_upper( $node_name[0] ) ) {
     1515                return '#tag';
     1516            }
     1517
     1518            if ( 'html' === $node_name ) {
     1519                return '#doctype';
     1520            }
     1521
     1522            return $node_name;
     1523        }
     1524
     1525        return parent::get_token_type();
     1526    }
     1527
     1528    /**
     1529     * Returns the value of a requested attribute from a matched tag opener if that attribute exists.
     1530     *
     1531     * Example:
     1532     *
     1533     *     $p = WP_HTML_Processor::create_fragment( '<div enabled class="test" data-test-id="14">Test</div>' );
     1534     *     $p->next_token() === true;
     1535     *     $p->get_attribute( 'data-test-id' ) === '14';
     1536     *     $p->get_attribute( 'enabled' ) === true;
     1537     *     $p->get_attribute( 'aria-label' ) === null;
     1538     *
     1539     *     $p->next_tag() === false;
     1540     *     $p->get_attribute( 'class' ) === null;
     1541     *
     1542     * @since 6.6.0 Subclassed for HTML Processor.
     1543     *
     1544     * @param string $name Name of attribute whose value is requested.
     1545     * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
     1546     */
     1547    public function get_attribute( $name ) {
     1548        if ( isset( $this->current_element ) ) {
     1549            // Closing tokens cannot contain attributes.
     1550            if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
     1551                return null;
     1552            }
     1553
     1554            $node_name = $this->current_element->token->node_name;
     1555
     1556            // Only tags can contain attributes.
     1557            if ( 'A' > $node_name[0] || 'Z' < $node_name[0] ) {
     1558                return null;
     1559            }
     1560
     1561            if ( $this->current_element->token->bookmark_name === (string) $this->bookmark_counter ) {
     1562                return parent::get_attribute( $name );
     1563            }
     1564        }
     1565
     1566        return null;
     1567    }
     1568
     1569    /**
     1570     * Gets lowercase names of all attributes matching a given prefix in the current tag.
     1571     *
     1572     * Note that matching is case-insensitive. This is in accordance with the spec:
     1573     *
     1574     * > There must never be two or more attributes on
     1575     * > the same start tag whose names are an ASCII
     1576     * > case-insensitive match for each other.
     1577     *     - HTML 5 spec
     1578     *
     1579     * Example:
     1580     *
     1581     *     $p = new WP_HTML_Tag_Processor( '<div data-ENABLED class="test" DATA-test-id="14">Test</div>' );
     1582     *     $p->next_tag( array( 'class_name' => 'test' ) ) === true;
     1583     *     $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-enabled', 'data-test-id' );
     1584     *
     1585     *     $p->next_tag() === false;
     1586     *     $p->get_attribute_names_with_prefix( 'data-' ) === null;
     1587     *
     1588     * @since 6.6.0 Subclassed for the HTML Processor.
     1589     *
     1590     * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
     1591     *
     1592     * @param string $prefix Prefix of requested attribute names.
     1593     * @return array|null List of attribute names, or `null` when no tag opener is matched.
     1594     */
     1595    public function get_attribute_names_with_prefix( $prefix ) {
     1596        if ( isset( $this->current_element ) ) {
     1597            if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
     1598                return null;
     1599            }
     1600
     1601            $mark = $this->bookmarks[ $this->current_element->token->bookmark_name ];
     1602            if ( 0 === $mark->length ) {
     1603                return null;
     1604            }
     1605        }
     1606
     1607        return parent::get_attribute_names_with_prefix( $prefix );
     1608    }
     1609
     1610    /**
     1611     * Returns the modifiable text for a matched token, or an empty string.
     1612     *
     1613     * Modifiable text is text content that may be read and changed without
     1614     * changing the HTML structure of the document around it. This includes
     1615     * the contents of `#text` nodes in the HTML as well as the inner
     1616     * contents of HTML comments, Processing Instructions, and others, even
     1617     * though these nodes aren't part of a parsed DOM tree. They also contain
     1618     * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
     1619     * other section in an HTML document which cannot contain HTML markup (DATA).
     1620     *
     1621     * If a token has no modifiable text then an empty string is returned to
     1622     * avoid needless crashing or type errors. An empty string does not mean
     1623     * that a token has modifiable text, and a token with modifiable text may
     1624     * have an empty string (e.g. a comment with no contents).
     1625     *
     1626     * @since 6.6.0 Subclassed for the HTML Processor.
     1627     *
     1628     * @return string
     1629     */
     1630    public function get_modifiable_text() {
     1631        if ( isset( $this->current_element ) ) {
     1632            if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
     1633                return '';
     1634            }
     1635
     1636            $mark = $this->bookmarks[ $this->current_element->token->bookmark_name ];
     1637            if ( 0 === $mark->length ) {
     1638                return '';
     1639            }
     1640        }
     1641        return parent::get_modifiable_text();
     1642    }
     1643
     1644    /**
    13461645     * Removes a bookmark that is no longer needed.
    13471646     *
     
    13841683            : 0;
    13851684        $bookmark_starts_at   = $this->bookmarks[ $actual_bookmark_name ]->start;
     1685        $bookmark_length      = $this->bookmarks[ $actual_bookmark_name ]->length;
    13861686        $direction            = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward';
    13871687
     
    14391739            $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
    14401740            $this->state->frameset_ok    = true;
     1741            $this->element_queue         = array();
     1742            $this->current_element       = null;
    14411743        }
    14421744
     
    14461748        }
    14471749
    1448         while ( $this->step() ) {
     1750        while ( $this->next_token() ) {
    14491751            if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
     1752                while ( isset( $this->current_element ) && WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
     1753                    $this->current_element = array_shift( $this->element_queue );
     1754                }
    14501755                return true;
    14511756            }
Note: See TracChangeset for help on using the changeset viewer.