Make WordPress Core

Changeset 58418


Ignore:
Timestamp:
06/15/2024 06:31:24 AM (6 months ago)
Author:
dmsnell
Message:

KSES: Preserve some additional invalid HTML comment syntaxes.

When wp_kses_split processes a document it attempts to leave HTML comments
alone. It makes minor adjustments, but leaves the comments in the document in
its output. Unfortunately it only recognizes one kind of HTML comment and
rejects many others.

This patch makes a minor adjustment to the algorithm in wp_kses_split to
recognize and preserve an additional kind of HTML comment: closing tags with
an invalid tag name, e.g. </%dolly>.

These invalid closing tags must be interpreted as comments by a browser.
This bug fix aligns the implementation of wp_kses_split() more closely
with its stated goal of leaving HTML comments as comments.

It doesn't attempt to fully fix the mis-parsed comments, but it does propose a
minor fix that hopefully won't break any existing code or projects.

Developed in https://github.com/WordPress/wordpress-develop/pull/6395
Discussed in https://core.trac.wordpress.org/ticket/61009

Props ellatrix, dmsnell, joemcgill, jorbin, westonruter, zieladam.
See #61009.

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/kses.php

    r58354 r58418  
    964964 *
    965965 * @since 1.0.0
     966 * @since 6.6.0 Recognize additional forms of invalid HTML which convert into comments.
    966967 *
    967968 * @global array[]|string $pass_allowed_html      An array of allowed HTML elements and attributes,
     
    982983    $pass_allowed_protocols = $allowed_protocols;
    983984
    984     return preg_replace_callback( '%(<!--.*?(-->|$))|(<[^>]*(>|$)|>)%', '_wp_kses_split_callback', $content );
     985    $token_pattern = <<<REGEX
     986~
     987    (                      # Detect comments of various flavors before attempting to find tags.
     988        (<!--.*?(-->|$))   #  - Normative HTML comments.
     989        |
     990        </[^a-zA-Z][^>]*>  #  - Closing tags with invalid tag names.
     991    )
     992    |
     993    (<[^>]*(>|$)|>)        # Tag-like spans of text.
     994~x
     995REGEX;
     996    return preg_replace_callback( $token_pattern, '_wp_kses_split_callback', $content );
    985997}
    986998
     
    10701082 * @ignore
    10711083 * @since 1.0.0
     1084 * @since 6.6.0 Recognize additional forms of invalid HTML which convert into comments.
    10721085 *
    10731086 * @param string         $content           Content to filter.
     
    10761089 *                                          for the list of accepted context names.
    10771090 * @param string[]       $allowed_protocols Array of allowed URL protocols.
     1091 *
    10781092 * @return string Fixed HTML element
    10791093 */
     
    10811095    $content = wp_kses_stripslashes( $content );
    10821096
    1083     // It matched a ">" character.
     1097    /*
     1098     * The regex pattern used to split HTML into chunks attempts
     1099     * to split on HTML token boundaries. This function should
     1100     * thus receive chunks that _either_ start with a meaningful syntax
     1101     * token, like a tag `<div>` or a comment `<!-- ... -->`, _or_ are a lone `>`.
     1102     *
     1103     * If the first character of the `$content` chunk _isn't_ one
     1104     * of these syntax elements, which always starts with `<`, then
     1105     * the match had to be for the final alternation of `>`. In such
     1106     * case, it's probably standing on its own and could be encoded
     1107     * with a character reference to remove ambiguity.
     1108     *
     1109     * In other words, if this chunk isn't from a match of a syntax
     1110     * token, it's just a plaintext greater-than (`>`) sign.
     1111     */
    10841112    if ( ! str_starts_with( $content, '<' ) ) {
    10851113        return '&gt;';
    10861114    }
    10871115
    1088     // Allow HTML comments.
     1116    /*
     1117     * When a closing tag appears with a name that isn't a valid tag name,
     1118     * it must be interpreted as an HTML comment. It extends until the
     1119     * first `>` character after the initial opening `</`.
     1120     *
     1121     * Preserve these comments and do not treat them like tags.
     1122     */
     1123    if ( 1 === preg_match( '~^</[^a-zA-Z][^>]*>$~', $content ) ) {
     1124        $content     = substr( $content, 2, -1 );
     1125        $transformed = null;
     1126
     1127        while ( $transformed !== $content ) {
     1128            $transformed = wp_kses( $content, $allowed_html, $allowed_protocols );
     1129            $content     = $transformed;
     1130        }
     1131
     1132        return "</{$transformed}>";
     1133    }
     1134
     1135    /*
     1136     * Normative HTML comments should be handled separately as their
     1137     * parsing rules differ from those for tags and text nodes.
     1138     */
    10891139    if ( str_starts_with( $content, '<!--' ) ) {
    10901140        $content = str_replace( array( '<!--', '-->' ), '', $content );
  • trunk/tests/phpunit/tests/kses.php

    r58294 r58418  
    19331933
    19341934    /**
     1935     * Ensures that `wp_kses()` preserves various kinds of HTML comments, both valid and invalid.
     1936     *
     1937     * @ticket 61009
     1938     * @dataProvider data_html_containing_various_kinds_of_html_comments
     1939     *
     1940     * @param string $html_comment    HTML containing syntax which a browser interprets as a comment.
     1941     * @param string $expected_output How `wp_kses()` ought to transform the comment.
     1942     */
     1943    public function test_wp_kses_preserves_html_comments( $html_comment, $expected_output ) {
     1944        $this->assertSame(
     1945            $expected_output,
     1946            wp_kses( $html_comment, array() ),
     1947            'Failed to properly preserve HTML comment.'
     1948        );
     1949    }
     1950
     1951    /**
     1952     * Data provider of HTML containing comments, paired with the expected output.
     1953     *
     1954     * @return array[] Named datasets, each holding the input HTML and its expected output.
     1955     */
     1956    public static function data_html_containing_various_kinds_of_html_comments() {
     1957        return array(
     1958            'Normative HTML comment'            => array( 'before<!-- this is a comment -->after', 'before<!-- this is a comment -->after' ),
     1959            'Closing tag with invalid tag name' => array( 'before<//not a tag>after', 'before<//not a tag>after' ),
     1960        );
     1961    }
     1962
     1963    /**
    19351964     * Test that attributes with a list of allowed values are filtered correctly.
    19361965     *
Note: See TracChangeset for help on using the changeset viewer.