Make WordPress Core


Ignore:
Timestamp:
03/12/2024 12:22:40 AM (9 months ago)
Author:
dmsnell
Message:

HTML API: Trigger active format reconstruction when reaching text nodes.

When encountering text nodes in an HTML document, the HTML parser needs
to run the active format reconstruction algorithm, even if it doesn't
stop to visit those text nodes. This is because any formatting elements
that get reconstructed become part of the breadcrumbs of every node
that follows the text node.

This patch runs the active format reconstruction algorithm whenever the
HTML Processor reaches a text node. It also enables visiting the other
token types, as is already possible in the Tag Processor.

Developed in https://github.com/WordPress/wordpress-develop/pull/6054
Discussed in https://core.trac.wordpress.org/ticket/60170

Props: dmsnell, jonsurrell, westonruter.
Fixes: #60455.
Follow-up to: [57348].

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r57768 r57806  
    362362        if ( null === $query ) {
    363363            while ( $this->step() ) {
     364                if ( '#tag' !== $this->get_token_type() ) {
     365                    continue;
     366                }
     367
    364368                if ( ! $this->is_tag_closer() ) {
    365369                    return true;
     
    385389        if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) {
    386390            while ( $this->step() ) {
     391                if ( '#tag' !== $this->get_token_type() ) {
     392                    continue;
     393                }
     394
    387395                if ( ! $this->is_tag_closer() ) {
    388396                    return true;
     
    406414
    407415        while ( $match_offset > 0 && $this->step() ) {
     416            if ( '#tag' !== $this->get_token_type() ) {
     417                continue;
     418            }
     419
    408420            if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) {
    409421                return true;
     
    429441     */
    430442    public function next_token() {
    431         $found_a_token = parent::next_token();
    432 
    433         if ( '#tag' === $this->get_token_type() ) {
    434             $this->step( self::PROCESS_CURRENT_NODE );
    435         }
    436 
    437         return $found_a_token;
     443        return $this->step();
    438444    }
    439445
     
    464470     */
    465471    public function matches_breadcrumbs( $breadcrumbs ) {
    466         if ( ! $this->get_tag() ) {
    467             return false;
    468         }
    469 
    470472        // Everything matches when there are zero constraints.
    471473        if ( 0 === count( $breadcrumbs ) ) {
     
    530532             */
    531533            $top_node = $this->state->stack_of_open_elements->current_node();
    532             if ( $top_node && self::is_void( $top_node->node_name ) ) {
     534            if (
     535                $top_node && (
     536                    // Void elements.
     537                    self::is_void( $top_node->node_name ) ||
     538                    // Comments, text nodes, and other atomic tokens.
     539                    '#' === $top_node->node_name[0] ||
     540                    // Doctype declarations.
     541                    'html' === $top_node->node_name
     542                )
     543            ) {
    533544                $this->state->stack_of_open_elements->pop();
    534545            }
     
    536547
    537548        if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
    538             while ( parent::next_token() && '#tag' !== $this->get_token_type() ) {
    539                 continue;
    540             }
     549            parent::next_token();
    541550        }
    542551
    543552        // Finish stepping when there are no more tokens in the document.
    544         if ( null === $this->get_tag() ) {
     553        if (
     554            WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
     555            WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
     556        ) {
    545557            return false;
    546558        }
    547559
    548560        $this->state->current_token = new WP_HTML_Token(
    549             $this->bookmark_tag(),
    550             $this->get_tag(),
     561            $this->bookmark_token(),
     562            $this->get_token_name(),
    551563            $this->has_self_closing_flag(),
    552564            $this->release_internal_bookmark_on_destruct
     
    592604     */
    593605    public function get_breadcrumbs() {
    594         if ( ! $this->get_tag() ) {
    595             return null;
    596         }
    597 
    598606        $breadcrumbs = array();
    599607        foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) {
     
    620628     */
    621629    private function step_in_body() {
    622         $tag_name = $this->get_tag();
    623         $op_sigil = $this->is_tag_closer() ? '-' : '+';
    624         $op       = "{$op_sigil}{$tag_name}";
     630        $token_name = $this->get_token_name();
     631        $token_type = $this->get_token_type();
     632        $op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
     633        $op         = "{$op_sigil}{$token_name}";
    625634
    626635        switch ( $op ) {
     636            case '#comment':
     637            case '#funky-comment':
     638            case '#presumptuous-tag':
     639                $this->insert_html_element( $this->state->current_token );
     640                return true;
     641
     642            case '#text':
     643                $this->reconstruct_active_formatting_elements();
     644
     645                $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
     646
     647                /*
     648                 * > A character token that is U+0000 NULL
     649                 *
     650                 * Any successive sequence of NULL bytes is ignored and won't
     651                 * trigger active format reconstruction. Therefore, if the text
     652                 * only comprises NULL bytes then the token should be ignored
     653                 * here, but if there are any other characters in the stream
     654                 * the active formats should be reconstructed.
     655                 */
     656                if (
     657                    1 <= $current_token->length &&
     658                    "\x00" === $this->html[ $current_token->start ] &&
     659                    strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
     660                ) {
     661                    // Parse error: ignore the token.
     662                    return $this->step();
     663                }
     664
     665                /*
     666                 * Whitespace-only text does not affect the frameset-ok flag.
     667                 * It is probably inter-element whitespace, but it may also
     668                 * contain character references which decode only to whitespace.
     669                 */
     670                $text = $this->get_modifiable_text();
     671                if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
     672                    $this->state->frameset_ok = false;
     673                }
     674
     675                $this->insert_html_element( $this->state->current_token );
     676                return true;
     677
     678            case 'html':
     679                /*
     680                 * > A DOCTYPE token
     681                 * > Parse error. Ignore the token.
     682                 */
     683                return $this->step();
     684
    627685            /*
    628686             * > A start tag whose tag name is "button"
     
    712770            case '-SUMMARY':
    713771            case '-UL':
    714                 if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) {
     772                if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) {
    715773                    // @todo Report parse error.
    716774                    // Ignore the token.
     
    719777
    720778                $this->generate_implied_end_tags();
    721                 if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
     779                if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
    722780                    // @todo Record parse error: this error doesn't impact parsing.
    723781                }
    724                 $this->state->stack_of_open_elements->pop_until( $tag_name );
     782                $this->state->stack_of_open_elements->pop_until( $token_name );
    725783                return true;
    726784
     
    784842                $this->generate_implied_end_tags();
    785843
    786                 if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
     844                if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
    787845                    // @todo Record parse error: this error doesn't impact parsing.
    788846                }
     
    800858                $this->state->frameset_ok = false;
    801859                $node                     = $this->state->stack_of_open_elements->current_node();
    802                 $is_li                    = 'LI' === $tag_name;
     860                $is_li                    = 'LI' === $token_name;
    803861
    804862                in_body_list_loop:
     
    863921                     */
    864922                    (
    865                         'LI' === $tag_name &&
     923                        'LI' === $token_name &&
    866924                        ! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' )
    867925                    ) ||
     
    873931                     */
    874932                    (
    875                         'LI' !== $tag_name &&
    876                         ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name )
     933                        'LI' !== $token_name &&
     934                        ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name )
    877935                    )
    878936                ) {
     
    885943                }
    886944
    887                 $this->generate_implied_end_tags( $tag_name );
    888 
    889                 if ( $tag_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
     945                $this->generate_implied_end_tags( $token_name );
     946
     947                if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
    890948                    // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
    891949                }
    892950
    893                 $this->state->stack_of_open_elements->pop_until( $tag_name );
     951                $this->state->stack_of_open_elements->pop_until( $token_name );
    894952                return true;
    895953
     
    10441102         * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
    10451103         */
    1046         switch ( $tag_name ) {
     1104        switch ( $token_name ) {
    10471105            case 'APPLET':
    10481106            case 'BASE':
     
    10921150            case 'XMP':
    10931151                $this->last_error = self::ERROR_UNSUPPORTED;
    1094                 throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
     1152                throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." );
    10951153        }
    10961154
     
    11141172             */
    11151173            foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
    1116                 if ( $tag_name === $node->node_name ) {
     1174                if ( $token_name === $node->node_name ) {
    11171175                    break;
    11181176                }
     
    11241182            }
    11251183
    1126             $this->generate_implied_end_tags( $tag_name );
     1184            $this->generate_implied_end_tags( $token_name );
    11271185            if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
    11281186                // @todo Record parse error: this error doesn't impact parsing.
     
    11431201
    11441202    /**
    1145      * Creates a new bookmark for the currently-matched tag and returns the generated name.
    1146      *
    1147      * @since 6.4.0
     1203     * Creates a new bookmark for the currently-matched token and returns the generated name.
     1204     *
     1205     * @since 6.4.0
     1206     * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
    11481207     *
    11491208     * @throws Exception When unable to allocate requested bookmark.
     
    11511210     * @return string|false Name of created bookmark, or false if unable to create.
    11521211     */
    1153     private function bookmark_tag() {
    1154         if ( ! $this->get_tag() ) {
    1155             return false;
    1156         }
    1157 
     1212    private function bookmark_token() {
    11581213        if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) {
    11591214            $this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS;
Note: See TracChangeset for help on using the changeset viewer.