Make WordPress Core


Ignore:
Timestamp:
12/20/2023 05:50:04 PM (18 months ago)
Author:
Bernhard Reiter
Message:

HTML API: Avoid processing incomplete tokens.

Currently the Tag Processor assumes that an input document is a full HTML document. Because of this, if there's lingering content after the last tag match it will treat that content as plaintext and skip over it. This is fine for the Tag Processor because if there is lingering content that isn't a valid tag then there's nothing for next_tag() to match.

However, in order to support a number of feature expansions it is important to recognize that the remaining content may involve partial syntax elements, such as incomplete tags, attributes, or comments.

In this patch we're adding a mode inside the Tag Processor which will flip when we start parsing HTML syntax but the document finishes before the token does. This will provide the ability to:

  • extend the input document,
  • avoid misinterpreting syntax as text, and
  • guess whether we have a complete document, and know when we have an incomplete one.

In the process of building this patch a few issues were identified and fixed in the Tag Processor, namely in the handling of incomplete syntax elements.

Props dmsnell, jonsurrell.
Fixes #60122, #60108.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

    r56703 r57211  
    17571757     *
    17581758     * @covers WP_HTML_Tag_Processor::next_tag
     1759     * @covers WP_HTML_Tag_Processor::paused_at_incomplete_token
    17591760     */
    17601761    public function test_unclosed_script_tag_should_not_cause_an_infinite_loop() {
    1761         $p = new WP_HTML_Tag_Processor( '<script>' );
    1762         $p->next_tag();
    1763         $this->assertSame( 'SCRIPT', $p->get_tag(), 'Did not find script tag' );
     1762        $p = new WP_HTML_Tag_Processor( '<script><div>' );
     1763        $this->assertFalse(
     1764            $p->next_tag(),
     1765            'Should not have stopped on an opening SCRIPT tag without a proper closing tag in the document.'
     1766        );
     1767        $this->assertTrue(
     1768            $p->paused_at_incomplete_token(),
     1769            "Should have paused the parser because of the incomplete SCRIPT tag but didn't."
     1770        );
     1771
     1772        // Run this to ensure that the test ends (not in an infinite loop).
    17641773        $p->next_tag();
    17651774    }
     
    19331942
    19341943    /**
     1944     * Ensures matching elements inside NOSCRIPT elements.
     1945     *
     1946     * In a browser when the scripting flag is enabled, everything inside
     1947     * the NOSCRIPT element will be ignored and treated as RAW TEXT. This
     1948     * means that it's valid to send what looks like incomplete or partial
     1949     * HTML syntax without impacting a rendered page. The Tag Processor is
     1950     * a parser with the scripting flag disabled, however, and needs to
     1951     * expose all the potential content that some code might want to modify.
     1952     *
     1953     * Were it not for this then the NOSCRIPT tag would be handled like the
     1954     * other tags in the RAW TEXT special group, e.g. NOEMBED or STYLE.
     1955     *
     1956     * @ticket 60122
     1957     *
     1958     * @covers WP_HTML_Tag_Processor::next_tag
     1959     */
     1960    public function test_processes_inside_of_noscript_elements() {
     1961        $p = new WP_HTML_Tag_Processor( '<noscript><input type="submit"></noscript><div>' );
     1962
     1963        $this->assertTrue( $p->next_tag( 'INPUT' ), 'Failed to find INPUT element inside NOSCRIPT element.' );
     1964        $this->assertTrue( $p->next_tag( 'DIV' ), 'Failed to find DIV element after NOSCRIPT element.' );
     1965    }
     1966
     1967    /**
    19351968     * @ticket 59292
    19361969     *
     
    19631996            'NOEMBED'          => array( '<noembed><p></p></noembed><div target>' ),
    19641997            'NOFRAMES'         => array( '<noframes><p>Check the rules here.</p></noframes><div target>' ),
    1965             'NOSCRIPT'         => array( '<noscript><span>This assumes that scripting mode is enabled.</span></noscript><p target>' ),
    19661998            'STYLE'            => array( '<style>* { margin: 0 }</style><div target>' ),
    19671999            'STYLE hiding DIV' => array( '<style>li::before { content: "<div non-target>" }</style><div target>' ),
     
    21402172     *
    21412173     * @covers WP_HTML_Tag_Processor::next_tag
     2174     * @covers WP_HTML_Tag_Processor::paused_at_incomplete_token
    21422175     *
    21432176     * @dataProvider data_html_with_unclosed_comments
    21442177     *
    2145      * @param string $html_ending_before_comment_close HTML with opened comments that aren't closed
     2178     * @param string $html_ending_before_comment_close HTML with opened comments that aren't closed.
    21462179     */
    21472180    public function test_documents_may_end_with_unclosed_comment( $html_ending_before_comment_close ) {
    21482181        $p = new WP_HTML_Tag_Processor( $html_ending_before_comment_close );
    21492182
    2150         $this->assertFalse( $p->next_tag() );
     2183        $this->assertFalse(
     2184            $p->next_tag(),
     2185            "Should not have found any tag, but found {$p->get_tag()}."
     2186        );
     2187
     2188        $this->assertTrue(
     2189            $p->paused_at_incomplete_token(),
     2190            "Should have indicated that the parser found an incomplete token but didn't."
     2191        );
    21512192    }
    21522193
     
    22812322
    22822323    /**
     2324     * Ensures that no tags are matched in a document containing only non-tag content.
     2325     *
     2326     * @ticket 60122
     2327     *
     2328     * @covers WP_HTML_Tag_Processor::next_tag
     2329     * @covers WP_HTML_Tag_Processor::paused_at_incomplete_token
     2330     *
     2331     * @dataProvider data_html_without_tags
     2332     *
     2333     * @param string $html_without_tags HTML without any tags in it.
     2334     */
     2335    public function test_next_tag_returns_false_when_there_are_no_tags( $html_without_tags ) {
     2336        $processor = new WP_HTML_Tag_Processor( $html_without_tags );
     2337
     2338        $this->assertFalse(
     2339            $processor->next_tag(),
     2340            "Shouldn't have found any tags but found {$processor->get_tag()}."
     2341        );
     2342
     2343        $this->assertFalse(
     2344            $processor->paused_at_incomplete_token(),
     2345            'Should have indicated that end of document was reached without evidence that elements were truncated.'
     2346        );
     2347    }
     2348
     2349    /**
     2350     * Data provider.
     2351     *
     2352     * @return array[]
     2353     */
     2354    public function data_html_without_tags() {
     2355        return array(
     2356            'DOCTYPE declaration'    => array( '<!DOCTYPE html>Just some HTML' ),
     2357            'No tags'                => array( 'this is nothing more than a text node' ),
     2358            'Text with comments'     => array( 'One <!-- sneaky --> comment.' ),
     2359            'Empty tag closer'       => array( '</>' ),
     2360            'Processing instruction' => array( '<?xml version="1.0"?>' ),
     2361            'Combination XML-like'   => array( '<!DOCTYPE xml><?xml version=""?><!-- this is not a real document. --><![CDATA[it only serves as a test]]>' ),
     2362        );
     2363    }
     2364
     2365    /**
     2366     * Ensures that the processor doesn't attempt to match an incomplete token.
     2367     *
    22832368     * @ticket 58637
    22842369     *
    22852370     * @covers WP_HTML_Tag_Processor::next_tag
     2371     * @covers WP_HTML_Tag_Processor::paused_at_incomplete_token
    22862372     *
    22872373     * @dataProvider data_incomplete_syntax_elements
     
    22892375     * @param string $incomplete_html HTML text containing some kind of incomplete syntax.
    22902376     */
    2291     public function test_returns_false_for_incomplete_syntax_elements( $incomplete_html ) {
     2377    public function test_next_tag_returns_false_for_incomplete_syntax_elements( $incomplete_html ) {
    22922378        $p = new WP_HTML_Tag_Processor( $incomplete_html );
    2293         $this->assertFalse( $p->next_tag() );
     2379
     2380        $this->assertFalse(
     2381            $p->next_tag(),
     2382            "Shouldn't have found any tags but found {$p->get_tag()}."
     2383        );
     2384
     2385        $this->assertTrue(
     2386            $p->paused_at_incomplete_token(),
     2387            "Should have indicated that the parser found an incomplete token but didn't."
     2388        );
    22942389    }
    22952390
     
    23012396    public function data_incomplete_syntax_elements() {
    23022397        return array(
    2303             'No tags'                              => array( 'this is nothing more than a text node' ),
    23042398            'Incomplete tag name'                  => array( '<swit' ),
    23052399            'Incomplete tag (no attributes)'       => array( '<div' ),
     
    23142408            'Incomplete DOCTYPE'                   => array( '<!DOCTYPE html' ),
    23152409            'Partial DOCTYPE'                      => array( '<!DOCTY' ),
    2316             'Incomplete CDATA'                     => array( '<[CDATA[something inside of here needs to get out' ),
    2317             'Partial CDATA'                        => array( '<[CDA' ),
    2318             'Partially closed CDATA]'              => array( '<[CDATA[cannot escape]' ),
    2319             'Partially closed CDATA]>'             => array( '<[CDATA[cannot escape]>' ),
     2410            'Incomplete CDATA'                     => array( '<![CDATA[something inside of here needs to get out' ),
     2411            'Partial CDATA'                        => array( '<![CDA' ),
     2412            'Partially closed CDATA]'              => array( '<![CDATA[cannot escape]' ),
     2413            'Partially closed CDATA]>'             => array( '<![CDATA[cannot escape]>' ),
     2414            'Unclosed IFRAME'                      => array( '<iframe><div>' ),
     2415            'Unclosed NOEMBED'                     => array( '<noembed><div>' ),
     2416            'Unclosed NOFRAMES'                    => array( '<noframes><div>' ),
     2417            'Unclosed SCRIPT'                      => array( '<script><div>' ),
     2418            'Unclosed STYLE'                       => array( '<style><div>' ),
     2419            'Unclosed TEXTAREA'                    => array( '<textarea><div>' ),
     2420            'Unclosed TITLE'                       => array( '<title><div>' ),
     2421            'Unclosed XMP'                         => array( '<xmp><div>' ),
     2422            'Partially closed IFRAME'              => array( '<iframe><div></iframe' ),
     2423            'Partially closed NOEMBED'             => array( '<noembed><div></noembed' ),
     2424            'Partially closed NOFRAMES'            => array( '<noframes><div></noframes' ),
     2425            'Partially closed SCRIPT'              => array( '<script><div></script' ),
     2426            'Partially closed STYLE'               => array( '<style><div></style' ),
     2427            'Partially closed TEXTAREA'            => array( '<textarea><div></textarea' ),
     2428            'Partially closed TITLE'               => array( '<title><div></title' ),
     2429            'Partially closed XMP'                 => array( '<xmp><div></xmp' ),
    23202430        );
    23212431    }
     
    24162526    public function test_updating_attributes_in_malformed_html( $html, $expected ) {
    24172527        $p = new WP_HTML_Tag_Processor( $html );
    2418         $p->next_tag();
     2528        $this->assertTrue( $p->next_tag(), 'Could not find first tag.' );
    24192529        $p->set_attribute( 'foo', 'bar' );
    24202530        $p->add_class( 'firstTag' );
     
    24352545     */
    24362546    public function data_updating_attributes_in_malformed_html() {
    2437         $null_byte = chr( 0 );
    2438 
    24392547        return array(
    24402548            'Invalid entity inside attribute value'        => array(
     
    24952603            ),
    24962604            'id without double quotation marks around null byte' => array(
    2497                 'input'    => '<hr id' . $null_byte . 'zero="test"><span>test</span>',
    2498                 'expected' => '<hr class="firstTag" foo="bar" id' . $null_byte . 'zero="test"><span class="secondTag">test</span>',
     2605                'input'    => "<hr id\x00zero=\"test\"><span>test</span>",
     2606                'expected' => "<hr class=\"firstTag\" foo=\"bar\" id\x00zero=\"test\"><span class=\"secondTag\">test</span>",
    24992607            ),
    25002608            'Unexpected > before an attribute'             => array(
     
    25842692        );
    25852693    }
     2694
     2695    /**
     2696     * @covers WP_HTML_Tag_Processor::next_tag
     2697     */
     2698    public function test_handles_malformed_taglike_open_short_html() {
     2699        $p      = new WP_HTML_Tag_Processor( '<' );
     2700        $result = $p->next_tag();
     2701        $this->assertFalse( $result, 'Did not handle "<" html properly.' );
     2702    }
     2703
     2704    /**
     2705     * @covers WP_HTML_Tag_Processor::next_tag
     2706     */
     2707    public function test_handles_malformed_taglike_close_short_html() {
     2708        $p      = new WP_HTML_Tag_Processor( '</ ' );
     2709        $result = $p->next_tag();
     2710        $this->assertFalse( $result, 'Did not handle "</ " html properly.' );
     2711    }
    25862712}
Note: See TracChangeset for help on using the changeset viewer.