Make WordPress Core

Changeset 56563


Ignore:
Timestamp:
09/13/2023 12:47:25 PM (20 months ago)
Author:
Bernhard Reiter
Message:

HTML API: Skip over contents of RAWTEXT elements such as STYLE.

When encountering elements that imply switching into the RAWTEXT parsing state,
the Tag Processor should skip processing until exiting the RAWTEXT state.

In this patch the Tag Processor does just that, except for the case of the
deprecated XMP element which implies further and more complicated rules.

This change implicitly assumes that the scripting flag is enabled during HTML
parsing, so that the contents of NOSCRIPT can be skipped. Otherwise, the contents
of that tag would have to be parsed.

Props dmsnell.
Fixes #59292.

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r56549 r56563  
    243243 *
    244244 * @since 6.2.0
     245 * @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive.
     246 * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE.
    245247 */
    246248class WP_HTML_Tag_Processor {
     
    569571             */
    570572            $t = $this->html[ $this->tag_name_starts_at ];
    571             if ( ! $this->is_closing_tag && ( 's' === $t || 'S' === $t || 't' === $t || 'T' === $t ) ) {
     573            if (
     574                ! $this->is_closing_tag &&
     575                (
     576                    'i' === $t || 'I' === $t ||
     577                    'n' === $t || 'N' === $t ||
     578                    's' === $t || 'S' === $t ||
     579                    't' === $t || 'T' === $t
     580                ) ) {
    572581                $tag_name = $this->get_tag();
    573582
     
    579588                    ! $this->skip_rcdata( $tag_name )
    580589                ) {
     590                    $this->bytes_already_parsed = strlen( $this->html );
     591                    return false;
     592                } elseif (
     593                    (
     594                        'IFRAME' === $tag_name ||
     595                        'NOEMBED' === $tag_name ||
     596                        'NOFRAMES' === $tag_name ||
     597                        'NOSCRIPT' === $tag_name ||
     598                        'STYLE' === $tag_name
     599                    ) &&
     600                    ! $this->skip_rawtext( $tag_name )
     601                ) {
     602                    /*
     603                     * "XMP" should be here too but its rules are more complicated and require the
     604                     * complexity of the HTML Processor (it needs to close out any open P element,
     605                     * meaning it can't be skipped here or else the HTML Processor will lose its
     606                     * place). For now, it can be ignored as it's a rare HTML tag in practice and
     607                     * any normative HTML should be using PRE instead.
     608                     */
    581609                    $this->bytes_already_parsed = strlen( $this->html );
    582610                    return false;
     
    711739    }
    712740
    713 
    714     /**
    715      * Skips contents of title and textarea tags.
     741    /**
     742     * Skips contents of generic rawtext elements.
     743     *
     744     * @since 6.3.2
     745     *
     746     * @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm
     747     *
     748     * @param string $tag_name The uppercase tag name which will close the RAWTEXT region.
     749     * @return bool Whether an end to the RAWTEXT region was found before the end of the document.
     750     */
     751    private function skip_rawtext( $tag_name ) {
     752        /*
     753         * skip_rawtext() and skip_rcdata() differ only in whether character references
     754         * are decoded. Since reading the inner markup isn't supported, there is no need
     755         * to implement them separately, so this delegates to skip_rcdata().
     756         */
     757        return $this->skip_rcdata( $tag_name );
     758    }
     759
     760    /**
     761     * Skips contents of RCDATA elements, namely title and textarea tags.
    716762     *
    717763     * @since 6.2.0
     
    719765     * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
    720766     *
    721      * @param string $tag_name The lowercase tag name which will close the RCDATA region.
     767     * @param string $tag_name The uppercase tag name which will close the RCDATA region.
    722768     * @return bool Whether an end to the RCDATA region was found before the end of the document.
    723769     */
  • trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

    r56299 r56563  
    18731873
    18741874    /**
     1875     * @ticket 59292
     1876     *
     1877     * @covers WP_HTML_Tag_Processor::next_tag
     1878     *
     1879     * @dataProvider data_next_tag_ignores_contents_of_rawtext_tags
     1880     *
     1881     * @param string $rawtext_element_then_target_node HTML starting with a RAWTEXT-specifying element such as STYLE,
     1882     *                                                 then an element afterward containing the "target" attribute.
     1883     */
     1884    public function test_next_tag_ignores_contents_of_rawtext_tags( $rawtext_element_then_target_node ) {
     1885        $processor = new WP_HTML_Tag_Processor( $rawtext_element_then_target_node );
     1886        $processor->next_tag(); // First match: the RAWTEXT-specifying element that starts the input.
     1887
     1888        $processor->next_tag(); // Second match: expected to be the element following the RAWTEXT region.
     1889        $this->assertNotNull(
     1890            $processor->get_attribute( 'target' ),
     1891            "Expected to find element with target attribute but found {$processor->get_tag()} instead."
     1892        );
     1893    }
     1894
     1895    /**
     1896     * Data provider for test_next_tag_ignores_contents_of_rawtext_tags().
     1897     *
     1898     * @return array[] Datasets keyed by name, each holding one HTML string: a RAWTEXT element followed by an element with a "target" attribute.
     1899     */
     1900    public function data_next_tag_ignores_contents_of_rawtext_tags() {
     1901        return array(
     1902            'IFRAME'           => array( '<iframe><section>Inside</section></iframe><section target>' ),
     1903            'NOEMBED'          => array( '<noembed><p></p></noembed><div target>' ),
     1904            'NOFRAMES'         => array( '<noframes><p>Check the rules here.</p></noframes><div target>' ),
     1905            'NOSCRIPT'         => array( '<noscript><span>This assumes that scripting mode is enabled.</span></noscript><p target>' ),
     1906            'STYLE'            => array( '<style>* { margin: 0 }</style><div target>' ),
     1907            'STYLE hiding DIV' => array( '<style>li::before { content: "<div non-target>" }</style><div target>' ),
     1908        );
     1909    }
     1910
     1911    /**
    18751912     * Ensures that the invalid comment closing syntax "--!>" properly closes a comment.
    18761913     *
Note: See TracChangeset for help on using the changeset viewer.