Make WordPress Core


Ignore:
Timestamp:
05/29/2024 11:40:16 AM (5 months ago)
Author:
dmsnell
Message:

HTML API: Fix token length bug in Tag Processor.

The Tag Processor stores the byte-offsets into its HTML document where
the current token starts and ends, and also for every bookmark. In some
cases for tags, the end offset has been off by one.

In this patch the offset is fixed so that a bookmark always properly
refers to the full span of the token it's bookmarking. Also the current
token byte offsets are properly recorded.

While this is a defect in the Tag Processor, it hasn't been exposed
through the public interface and has not affected any of the workings
of the processor. Only subclasses which rely on the length of a bookmark
have been potentially affected, and these are not supported environments
in the ongoing work.

This fix is important for future work and for ensuring that subclasses
performing custom behaviors remain as reliable as the public interface.

Developed in https://github.com/WordPress/wordpress-develop/pull/6625
Discussed in https://core.trac.wordpress.org/ticket/61301

Props dmsnell, gziolo, jonsurrell, westonruter.
Fixes #61301.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

    r57805 r58233  
    475475
    476476        $this->assertSame( '<div wonky><img hidden></div>', $processor->get_updated_html() );
     477    }
     478
     479    /**
     480     * Ensures that a bookmark's start and length correctly describe a given token in HTML.
     481     *
     482     * @ticket 61301
     483     *
     484     * @dataProvider data_html_nth_token_substring
     485     *
     486     * @param string $html            Input HTML.
     487     * @param int    $match_nth_token Which token to inspect from input HTML.
     488     * @param string $expected_match  Expected full raw token bookmark should capture.
     489     */
     490    public function test_token_bookmark_span( string $html, int $match_nth_token, string $expected_match ) {
     491        $processor = new class( $html ) extends WP_HTML_Tag_Processor {
     492            /**
     493             * Returns the raw span of HTML for the currently-matched
     494             * token, or null if not paused on any token.
     495             *
     496             * @return string|null Raw HTML content of currently-matched token,
     497             *                     otherwise `null` if not matched.
     498             */
     499            public function get_raw_token() {
     500                if (
     501                    WP_HTML_Tag_Processor::STATE_READY === $this->parser_state ||
     502                    WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
     503                    WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
     504                ) {
     505                    return null;
     506                }
     507
     508                $this->set_bookmark( 'mark' );
     509                $mark = $this->bookmarks['mark'];
     510
     511                return substr( $this->html, $mark->start, $mark->length );
     512            }
     513        };
     514
     515        for ( $i = 0; $i < $match_nth_token; $i++ ) {
     516            $processor->next_token();
     517        }
     518
     519        $raw_token = $processor->get_raw_token();
     520        $this->assertIsString(
     521            $raw_token,
     522            "Failed to find raw token at position {$match_nth_token}: check test data provider."
     523        );
     524
     525        $this->assertSame(
     526            $expected_match,
     527            $raw_token,
     528            'Bookmarked wrong span of text for full matched token.'
     529        );
     530    }
     531
     532    /**
     533     * Data provider.
     534     *
     535     * @return array
     536     */
     537    public static function data_html_nth_token_substring() {
     538        return array(
     539            // Tags.
     540            'DIV start tag'                 => array( '<div>', 1, '<div>' ),
     541            'DIV start tag with attributes' => array( '<div class="x" disabled>', 1, '<div class="x" disabled>' ),
     542            'DIV end tag'                   => array( '</div>', 1, '</div>' ),
     543            'DIV end tag with attributes'   => array( '</div class="x" disabled>', 1, '</div class="x" disabled>' ),
     544            'Nested DIV'                    => array( '<div><div b>', 2, '<div b>' ),
     545            'Sibling DIV'                   => array( '<div></div><div b>', 3, '<div b>' ),
     546            'DIV after text'                => array( 'text <div>', 2, '<div>' ),
     547            'DIV before text'               => array( '<div> text', 1, '<div>' ),
     548            'DIV after comment'             => array( '<!-- comment --><div>', 2, '<div>' ),
     549            'DIV before comment'            => array( '<div><!-- c --> ', 1, '<div>' ),
     550            'Start "self-closing" tag'      => array( '<div />', 1, '<div />' ),
     551            'Void tag'                      => array( '<img src="img.png">', 1, '<img src="img.png">' ),
     552            'Void tag w/self-closing flag'  => array( '<img src="img.png" />', 1, '<img src="img.png" />' ),
     553            'Void tag inside DIV'           => array( '<div><img src="img.png"></div>', 2, '<img src="img.png">' ),
     554
     555            // Special atomic tags.
     556            'SCRIPT tag'                    => array( '<script>inside text</script>', 1, '<script>inside text</script>' ),
     557            'SCRIPT double-escape'          => array( '<script><!-- <script> echo "</script>"; </script><div>', 1, '<script><!-- <script> echo "</script>"; </script>' ),
     558
     559            // Text.
     560            'Text'                          => array( 'Just text', 1, 'Just text' ),
     561            'Text in DIV'                   => array( '<div>Text<div>', 2, 'Text' ),
     562            'Text before DIV'               => array( 'Text<div>', 1, 'Text' ),
     563            'Text after DIV'                => array( '<div></div>Text', 3, 'Text' ),
     564            'Text after comment'            => array( '<!-- comment -->Text', 2, 'Text' ),
     565            'Text before comment'           => array( 'Text<!-- c --> ', 1, 'Text' ),
     566
     567            // Comments.
     568            'Comment'                       => array( '<!-- comment -->', 1, '<!-- comment -->' ),
     569            'Comment in DIV'                => array( '<div><!-- comment --><div>', 2, '<!-- comment -->' ),
     570            'Comment before DIV'            => array( '<!-- comment --><div>', 1, '<!-- comment -->' ),
     571            'Comment after DIV'             => array( '<div></div><!-- comment -->', 3, '<!-- comment -->' ),
     572            'Comment after comment'         => array( '<!-- comment --><!-- comment -->', 2, '<!-- comment -->' ),
     573            'Comment before comment'        => array( '<!-- comment --><!-- c --> ', 1, '<!-- comment -->' ),
     574            'Abruptly closed comment'       => array( '<!-->', 1, '<!-->' ),
     575            'Empty comment'                 => array( '<!---->', 1, '<!---->' ),
     576            'Funky comment'                 => array( '</_ funk >', 1, '</_ funk >' ),
     577            'PI lookalike comment'          => array( '<?processing instruction?>', 1, '<?processing instruction?>' ),
     578            'CDATA lookalike comment'       => array( '<![CDATA[ see? data ]]>', 1, '<![CDATA[ see? data ]]>' ),
     579        );
    477580    }
    478581
     
    27472850                $this->set_bookmark( 'here' );
    27482851                $this->lexical_updates[] = new WP_HTML_Text_Replacement(
    2749                     $this->bookmarks['here']->start + $this->bookmarks['here']->length + 1,
     2852                    $this->bookmarks['here']->start + $this->bookmarks['here']->length,
    27502853                    0,
    27512854                    $new_html
Note: See TracChangeset for help on using the changeset viewer.