Make WordPress Core

Changeset 60649


Ignore:
Timestamp:
08/19/2025 07:07:11 PM (3 months ago)
Author:
jonsurrell
Message:

HTML API: Improve script tag escape state processing.

Addresses some edge cases in the parsing of script tag contents:

  • "<!-->" remains in the unescaped state and does not enter the escaped state.
  • Contents in the escaped state that end with "<script" do not enter the double-escaped state.
  • "\f" (Form Feed) was missing as a tag name terminating character.

Developed in https://github.com/WordPress/wordpress-develop/pull/9397 and https://github.com/WordPress/wordpress-develop/pull/9402.

Props jonsurrell, dmsnell.
See #63738.

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r60617 r60649  
    15571557
    15581558            /*
    1559              * Unlike with "-->", the "<!--" only transitions
    1560              * into the escaped mode if not already there.
    1561              *
    1562              * Inside the escaped modes it will be ignored; and
    1563              * should never break out of the double-escaped
    1564              * mode and back into the escaped mode.
    1565              *
    1566              * While this requires a mode change, it does not
    1567              * impact the parsing otherwise, so continue
    1568              * parsing after updating the state.
     1559             * "<!--" only transitions from _unescaped_ to _escaped_. This byte sequence is only
     1560             * significant in the _unescaped_ state and is ignored in any other state.
    15691561             */
    15701562            if (
     1563                'unescaped' === $state &&
    15711564                '!' === $html[ $at ] &&
    15721565                '-' === $html[ $at + 1 ] &&
    15731566                '-' === $html[ $at + 2 ]
    15741567            ) {
    1575                 $at   += 3;
    1576                 $state = 'unescaped' === $state ? 'escaped' : $state;
     1568                $at += 3;
     1569
     1570                /*
     1571                 * The parser is ready to enter the _escaped_ state, but may remain in the
     1572                 * _unescaped_ state. This occurs when "<!--" is immediately followed by a
     1573                 * sequence of 0 or more "-" followed by ">". This is similar to abruptly closed
     1574                 * HTML comments like "<!-->" or "<!--->".
     1575                 *
     1576                 * Note that this check may advance the position significantly and requires a
     1577                 * length check to prevent bad offsets on inputs like `<script><!---------`.
     1578                 */
     1579                $at += strspn( $html, '-', $at );
     1580                if ( $at < $doc_length && '>' === $html[ $at ] ) {
     1581                    ++$at;
     1582                    continue;
     1583                }
     1584
     1585                $state = 'escaped';
    15771586                continue;
    15781587            }
     
    16111620            $at += 6;
    16121621            $c   = $html[ $at ];
    1613             if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) {
    1614                 ++$at;
     1622            if (
     1623                /**
     1624                 * These characters trigger state transitions of interest:
     1625                 *
     1626                 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state}
     1627                 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state}
     1628                 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state}
     1629                 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state}
     1630                 *
     1631                 * The "\r" character is not present in the above references. However, "\r" must be
     1632                 * treated the same as "\n". This is because the HTML Standard requires newline
     1633                 * normalization during preprocessing which applies this replacement.
     1634                 *
     1635                 * - @see https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
     1636                 * - @see https://infra.spec.whatwg.org/#normalize-newlines
     1637                 */
     1638                '>' !== $c &&
     1639                ' ' !== $c &&
     1640                "\n" !== $c &&
     1641                '/' !== $c &&
     1642                "\t" !== $c &&
     1643                "\f" !== $c &&
     1644                "\r" !== $c
     1645            ) {
    16151646                continue;
    16161647            }
  • trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

    r60647 r60649  
    20102010     * Data provider.
    20112011     */
    2012     public static function data_script_tag(): array {
    2013         return array(
    2014             'Basic script tag'                          => array( '<script></script>', true ),
    2015             'Script with type attribute'                => array( '<script type="text/javascript"></script>', true ),
    2016             'Script data escaped'                       => array( '<script><!--</script>', true ),
    2017             'Script data double-escaped exit (comment)' => array( '<script><!--<script>--></script>', true ),
    2018             'Script data double-escaped exit (closed)'  => array( '<script><!--<script></script></script>', true ),
    2019             'Script data double-escaped exit (closed/truncated)' => array( '<script><!--<script></script </script>', true ),
    2020             'Script data no double-escape'              => array( '<script><!-- --><script></script>', true ),
    2021 
    2022             'Script tag with self-close flag (ignored)' => array( '<script />', false ),
    2023             'Script data double-escaped'                => array( '<script><!--<script></script>', false ),
    2024         );
     2012    public static function data_script_tag(): Generator {
     2013            yield 'Basic script tag'                              => array( '<script></script>', true );
     2014            yield 'Script tag with </script> close'               => array( '<script></script>', true );
     2015            yield 'Script tag with </script/> close'              => array( '<script></script/>', true );
     2016            yield 'Script tag with </script > close'              => array( '<script></script >', true );
     2017            yield 'Script tag with </script\n> close'             => array( "<script></script\n>", true );
     2018            yield 'Script tag with </script\t> close'             => array( "<script></script\t>", true );
     2019            yield 'Script tag with </script\f> close'             => array( "<script></script\f>", true );
     2020            yield 'Script tag with </script\r> close'             => array( "<script></script\r>", true );
     2021            yield 'Script with type attribute'                    => array( '<script type="text/javascript"></script>', true );
     2022            yield 'Script data escaped'                           => array( '<script><!--</script>', true );
     2023            yield 'Script data double-escaped exit (comment)'     => array( '<script><!--<script>--></script>', true );
     2024            yield 'Script data double-escaped exit (closed ">")'  => array( '<script><!--<script></script></script>', true );
     2025            yield 'Script data double-escaped exit (closed "/")'  => array( '<script><!--<script></script/</script>', true );
     2026            yield 'Script data double-escaped exit (closed " ")'  => array( '<script><!--<script></script </script>', true );
     2027            yield 'Script data double-escaped exit (closed "\n")' => array( "<script><!--<script></script\n</script>", true );
     2028            yield 'Script data double-escaped exit (closed "\t")' => array( "<script><!--<script></script\t</script>", true );
     2029            yield 'Script data double-escaped exit (closed "\f")' => array( "<script><!--<script></script\f</script>", true );
     2030            yield 'Script data double-escaped exit (closed "\r")' => array( "<script><!--<script></script\r</script>", true );
     2031            yield 'Script data no double-escape'                  => array( '<script><!-- --><script></script>', true );
     2032            yield 'Script data no double-escape (short comment)'  => array( '<script><!--><script></script>', true );
     2033            yield 'Script data almost double-escaped'             => array( '<script><!--<script</script>', true );
     2034            yield 'Script data with complex JavaScript'           => array(
     2035                '<script>
     2036                    var x = 10;
     2037                    x--;
     2038                    x < 0 ? x += 100 : x = (x + 1) - 1;
     2039                </script>',
     2040                true,
     2041            );
     2042
     2043            yield 'Script tag with self-close flag (ignored)'     => array( '<script />', false );
     2044            yield 'Script data double-escaped'                    => array( '<script><!--<script></script>', false );
     2045            yield 'Unclosed script in escaped state'              => array( '<script><!--------------', false );
     2046            yield 'Unclosed script in double escaped state'       => array( '<script><!--<script ', false );
     2047            yield 'Document end in closer start'                  => array( '<script></', false );
     2048            yield 'Document end in script closer'                 => array( '<script></script', false );
     2049            yield 'Document end in script closer with attributes' => array( '<script></script attr="val"', false );
     2050            yield 'Script tag double-escaped with <script>'       => array( '<script><!--<script></script>', false );
     2051            yield 'Script tag double-escaped with <script/'       => array( '<script><!--<script/</script>', false );
     2052            yield 'Script tag double-escaped with <script '       => array( '<script><!--<script </script>', false );
     2053            yield 'Script tag double-escaped with <script\n'      => array( "<script><!--<script\n</script>", false );
     2054            yield 'Script tag double-escaped with <script\t'      => array( "<script><!--<script\t</script>", false );
     2055            yield 'Script tag double-escaped with <script\f'      => array( "<script><!--<script\f</script>", false );
     2056            yield 'Script tag double-escaped with <script\r'      => array( "<script><!--<script\r</script>", false );
    20252057    }
    20262058
Note: See TracChangeset for help on using the changeset viewer.