Make WordPress Core

Changeset 58829


Ignore:
Timestamp:
07/29/2024 05:57:12 PM (2 months ago)
Author:
dmsnell
Message:

HTML API: Add set_modifiable_text() for replacing text nodes.

This patch introduces a new method, set_modifiable_text(), to the
Tag Processor, which makes it possible and safe to replace text nodes
within an HTML document, performing the appropriate escaping.

This method can be used in conjunction with other code to modify the
text content of a document, and can be used for transforming HTML
in a streaming fashion.

Developed in https://github.com/wordpress/wordpress-develop/pull/7007
Discussed in https://core.trac.wordpress.org/ticket/61617

Props: dmsnell, gziolo, zieladam.
Fixes #61617.

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r58779 r58829  
    28902890        }
    28912891
    2892         $text = substr( $this->html, $this->text_starts_at, $this->text_length );
     2892        $text = isset( $this->lexical_updates['modifiable text'] )
     2893            ? $this->lexical_updates['modifiable text']->text
     2894            : substr( $this->html, $this->text_starts_at, $this->text_length );
    28932895
    28942896        /*
     
    29552957            ? str_replace( "\x00", '', $decoded )
    29562958            : str_replace( "\x00", "\u{FFFD}", $decoded );
     2959    }
     2960
     2961    /**
     2962     * Sets the modifiable text for the matched token, if matched.
     2963     *
     2964     * Modifiable text is text content that may be read and changed without
     2965     * changing the HTML structure of the document around it. This includes
     2966     * the contents of `#text` nodes in the HTML as well as the inner
     2967     * contents of HTML comments, Processing Instructions, and others, even
     2968     * though these nodes aren't part of a parsed DOM tree. It also includes
     2969     * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
     2970     * other section in an HTML document which cannot contain HTML markup (DATA).
     2971     *
     2972     * Not all modifiable text may be set by this method, and not all content
     2973     * may be set as modifiable text. In the case that this fails it will return
     2974     * `false` indicating as much. For instance, it will not allow inserting the
     2975     * string `</script` into a SCRIPT element, because the rules for escaping
     2976     * that safely are complicated. Similarly, it will not allow setting content
     2977     * into a comment which would prematurely terminate the comment.
     2978     *
     2979     * Example:
     2980     *
     2981     *     // Add a preface to all STYLE contents.
     2982     *     while ( $processor->next_tag( 'STYLE' ) ) {
     2983     *         $style = $processor->get_modifiable_text();
     2984     *         $processor->set_modifiable_text( "/* Made with love on the World Wide Web */\n{$style}" );
     2985     *     }
     2986     *
     2987     *     // Replace smiley text with Emoji smilies.
     2988     *     while ( $processor->next_token() ) {
     2989     *         if ( '#text' !== $processor->get_token_name() ) {
     2990     *             continue;
     2991     *         }
     2992     *
     2993     *         $chunk = $processor->get_modifiable_text();
     2994     *         if ( ! str_contains( $chunk, ':)' ) ) {
     2995     *             continue;
     2996     *         }
     2997     *
     2998     *         $processor->set_modifiable_text( str_replace( ':)', '🙂', $chunk ) );
     2999     *     }
     3000     *
     3001     * @since 6.7.0
     3002     *
     3003     * @param string $plaintext_content New text content to represent in the matched token.
     3004     *
     3005     * @return bool Whether the text was able to update.
     3006     */
     3007    public function set_modifiable_text( string $plaintext_content ): bool {
     3008        if ( self::STATE_TEXT_NODE === $this->parser_state ) {
     3009            $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
     3010                $this->text_starts_at,
     3011                $this->text_length,
     3012                htmlspecialchars( $plaintext_content, ENT_QUOTES | ENT_HTML5 )
     3013            );
     3014
     3015            return true;
     3016        }
     3017
     3018        // Comment data is not encoded.
     3019        if (
     3020            self::STATE_COMMENT === $this->parser_state &&
     3021            self::COMMENT_AS_HTML_COMMENT === $this->comment_type
     3022        ) {
     3023            // Check if the text could close the comment.
     3024            if ( 1 === preg_match( '/--!?>/', $plaintext_content ) ) {
     3025                return false;
     3026            }
     3027
     3028            $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
     3029                $this->text_starts_at,
     3030                $this->text_length,
     3031                $plaintext_content
     3032            );
     3033
     3034            return true;
     3035        }
     3036
     3037        if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
     3038            return false;
     3039        }
     3040
     3041        switch ( $this->get_tag() ) {
     3042            case 'SCRIPT':
     3043                /*
     3044                 * This is over-protective, but ensures the update doesn't break
     3045                 * out of the SCRIPT element. A more thorough check would need to
     3046                 * ensure that the script closing tag doesn't exist, and isn't
     3047                 * also "hidden" inside the script double-escaped state.
     3048                 *
     3049                 * It may seem like replacing `</script` with `<\/script` would
     3050                 * properly escape these things, but this could mask regex patterns
     3051                 * that previously worked. Resolve this by rejecting any update containing `</script`.
     3052                 */
     3053                if ( false !== stripos( $plaintext_content, '</script' ) ) {
     3054                    return false;
     3055                }
     3056
     3057                $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
     3058                    $this->text_starts_at,
     3059                    $this->text_length,
     3060                    $plaintext_content
     3061                );
     3062
     3063                return true;
     3064
     3065            case 'STYLE':
     3066                $plaintext_content = preg_replace_callback(
     3067                    '~</(?P<TAG_NAME>style)~i',
     3068                    static function ( $tag_match ) {
     3069                        return "\\3c\\2f{$tag_match['TAG_NAME']}";
     3070                    },
     3071                    $plaintext_content
     3072                );
     3073
     3074                $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
     3075                    $this->text_starts_at,
     3076                    $this->text_length,
     3077                    $plaintext_content
     3078                );
     3079
     3080                return true;
     3081
     3082            case 'TEXTAREA':
     3083            case 'TITLE':
     3084                $plaintext_content = preg_replace_callback(
     3085                    "~</(?P<TAG_NAME>{$this->get_tag()})~i",
     3086                    static function ( $tag_match ) {
     3087                        return "&lt;/{$tag_match['TAG_NAME']}";
     3088                    },
     3089                    $plaintext_content
     3090                );
     3091
     3092                /*
     3093                 * These don't _need_ to be escaped, but since they are decoded it's
     3094                 * safe to leave them escaped and this can prevent other code from
     3095                 * naively detecting tags within the contents.
     3096                 *
     3097                 * @todo It would be useful to prefix a multiline replacement text
     3098                 *       with a newline, but not necessary. This is for aesthetics.
     3099                 */
     3100                $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
     3101                    $this->text_starts_at,
     3102                    $this->text_length,
     3103                    $plaintext_content
     3104                );
     3105
     3106                return true;
     3107        }
     3108
     3109        return false;
    29573110    }
    29583111
  • trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php

    r58779 r58829  
    4141
    4242    /**
     43     * Ensures that updates to modifiable text that are shorter than the
     44     * original text do not cause the parser to lose its orientation.
     45     *
     46     * @ticket 61617
     47     */
     48    public function test_setting_shorter_modifiable_text() {
     49        $processor = new WP_HTML_Tag_Processor( '<div><textarea>very long text</textarea><div id="not a <span>">' );
     50
     51        // Find the test node in the middle.
     52        while ( 'TEXTAREA' !== $processor->get_token_name() && $processor->next_token() ) {
     53            continue;
     54        }
     55
     56        $this->assertSame(
     57            'TEXTAREA',
     58            $processor->get_token_name(),
     59            'Failed to find the test TEXTAREA node; check the test setup.'
     60        );
     61
     62        $processor->set_modifiable_text( 'short' );
     63        $processor->get_updated_html();
     64        $this->assertSame(
     65            'short',
     66            $processor->get_modifiable_text(),
     67            'Should have updated modifiable text to something shorter than the original.'
     68        );
     69
     70        $this->assertTrue(
     71            $processor->next_token(),
     72            'Should have advanced to the last token in the input.'
     73        );
     74
     75        $this->assertSame(
     76            'DIV',
     77            $processor->get_token_name(),
     78            'Should have recognized the final DIV in the input.'
     79        );
     80
     81        $this->assertSame(
     82            'not a <span>',
     83            $processor->get_attribute( 'id' ),
     84            'Should have read in the id from the last DIV as "not a <span>"'
     85        );
     86    }
     87
     88    /**
     89     * Ensures that reading modifiable text after setting it returns the updated
     90     * enqueued value, and not the original value.
     91     *
     92     * @ticket 61617
     93     */
     94    public function test_modifiable_text_reads_updates_after_setting() {
     95        $processor = new WP_HTML_Tag_Processor( 'This is text<!-- this is not -->' );
     96
     97        $processor->next_token();
     98        $this->assertSame(
     99            '#text',
     100            $processor->get_token_name(),
     101            'Failed to find first text node: check test setup.'
     102        );
     103
     104        $update = 'This is new text';
     105        $processor->set_modifiable_text( $update );
     106        $this->assertSame(
     107            $update,
     108            $processor->get_modifiable_text(),
     109            'Failed to read updated enqueued value of text node.'
     110        );
     111
     112        $processor->next_token();
     113        $this->assertSame(
     114            '#comment',
     115            $processor->get_token_name(),
     116            'Failed to advance to comment: check test setup.'
     117        );
     118
     119        $this->assertSame(
     120            ' this is not ',
     121            $processor->get_modifiable_text(),
     122            'Failed to read modifiable text for next token; did it read the old enqueued value from the previous token?'
     123        );
     124    }
     125
     126    /**
    43127     * Ensures that when ignoring a newline after LISTING and PRE tags, that this
    44128     * happens appropriately after seeking.
     
    109193        );
    110194    }
     195
     196    /**
     197     * Ensures that modifiable text updates are not applied where they aren't supported.
     198     *
     199     * @ticket 61617
     200     *
     201     * @dataProvider data_tokens_not_supporting_modifiable_text_updates
     202     *
     203     * @param string $html             Contains HTML with a token not supporting modifiable text updates.
     204     * @param int    $advance_n_tokens Count of times to run `next_token()` before reaching target node.
     205     */
     206    public function test_rejects_updates_on_unsupported_match_locations( string $html, int $advance_n_tokens ) {
     207        $processor = new WP_HTML_Tag_Processor( $html );
     208        while ( --$advance_n_tokens >= 0 ) {
     209            $processor->next_token();
     210        }
     211
     212        $this->assertFalse(
     213            $processor->set_modifiable_text( 'Bazinga!' ),
     214            'Should have prevented modifying the text at the target node.'
     215        );
     216
     217        $this->assertSame(
     218            $html,
     219            $processor->get_updated_html(),
     220            'Should not have modified the input document in any way.'
     221        );
     222    }
     223
     224    /**
     225     * Data provider.
     226     *
     227     * @return array[]
     228     */
     229    public static function data_tokens_not_supporting_modifiable_text_updates() {
     230        return array(
     231            'Before parsing'               => array( 'nothing to see here', 0 ),
     232            'After parsing'                => array( 'nothing here either', 2 ),
     233            'Incomplete document'          => array( '<tag without="an end', 1 ),
     234            'Presumptuous closer'          => array( 'before</>after', 2 ),
     235            'Invalid (CDATA)'              => array( '<![CDATA[this is a comment]]>', 1 ),
     236            'Invalid (shortest comment)'   => array( '<!-->', 1 ),
     237            'Invalid (shorter comment)'    => array( '<!--->', 1 ),
     238            'Invalid (markup declaration)' => array( '<!run>', 1 ),
     239            'Invalid (PI-like node)'       => array( '<?xml is not html ?>', 1 ),
     240        );
     241    }
     242
     243    /**
     244     * Ensures that modifiable text updates are applied as expected to supported nodes.
     245     *
     246     * @ticket 61617
     247     *
     248     * @dataProvider data_tokens_with_basic_modifiable_text_updates
     249     *
     250     * @param string $html             Contains HTML with a token supporting modifiable text updates.
     251     * @param int    $advance_n_tokens Count of times to run `next_token()` before reaching target node.
     252     * @param string $raw_replacement  This should be escaped properly when replaced as modifiable text.
     253     * @param string $transformed      Expected output after updating modifiable text.
     254     */
     255    public function test_updates_basic_modifiable_text_on_supported_nodes( string $html, int $advance_n_tokens, string $raw_replacement, string $transformed ) {
     256        $processor = new WP_HTML_Tag_Processor( $html );
     257        while ( --$advance_n_tokens >= 0 ) {
     258            $processor->next_token();
     259        }
     260
     261        $this->assertTrue(
     262            $processor->set_modifiable_text( $raw_replacement ),
     263            'Should have modified the text at the target node.'
     264        );
     265
     266        $this->assertSame(
     267            $transformed,
     268            $processor->get_updated_html(),
     269            "Should have transformed the HTML as expected when modifying the target node's modifiable text."
     270        );
     271    }
     272
     273    /**
     274     * Data provider.
     275     *
     276     * @return array[]
     277     */
     278    public static function data_tokens_with_basic_modifiable_text_updates() {
     279        return array(
     280            'Text node (start)'       => array( 'Text', 1, 'Blubber', 'Blubber' ),
     281            'Text node (middle)'      => array( '<em>Bold move</em>', 2, 'yo', '<em>yo</em>' ),
     282            'Text node (end)'         => array( '<img>of a dog', 2, 'of a cat', '<img>of a cat' ),
     283            'Encoded text node'       => array( '<figcaption>birds and dogs</figcaption>', 2, '<birds> & <dogs>', '<figcaption>&lt;birds&gt; &amp; &lt;dogs&gt;</figcaption>' ),
     284            'SCRIPT tag'              => array( 'before<script></script>after', 2, 'const img = "<img> & <br>";', 'before<script>const img = "<img> & <br>";</script>after' ),
     285            'STYLE tag'               => array( '<style></style>', 1, 'p::before { content: "<img> & </style>"; }', '<style>p::before { content: "<img> & \3c\2fstyle>"; }</style>' ),
     286            'TEXTAREA tag'            => array( 'a<textarea>has no need to escape</textarea>b', 2, "so it <doesn't>", "a<textarea>so it <doesn't></textarea>b" ),
     287            'TEXTAREA (escape)'       => array( 'a<textarea>has no need to escape</textarea>b', 2, 'but it does for </textarea>', 'a<textarea>but it does for &lt;/textarea></textarea>b' ),
     288            'TEXTAREA (escape+attrs)' => array( 'a<textarea>has no need to escape</textarea>b', 2, 'but it does for </textarea not an="attribute">', 'a<textarea>but it does for &lt;/textarea not an="attribute"></textarea>b' ),
     289            'TITLE tag'               => array( 'a<title>has no need to escape</title>b', 2, "so it <doesn't>", "a<title>so it <doesn't></title>b" ),
     290            'TITLE (escape)'          => array( 'a<title>has no need to escape</title>b', 2, 'but it does for </title>', 'a<title>but it does for &lt;/title></title>b' ),
     291            'TITLE (escape+attrs)'    => array( 'a<title>has no need to escape</title>b', 2, 'but it does for </title not an="attribute">', 'a<title>but it does for &lt;/title not an="attribute"></title>b' ),
     292        );
     293    }
     294
     295    /**
     296     * Ensures that updates with potentially-compromising values aren't accepted.
     297     *
     298     * For example, a modifiable text update should not be allowed if it would break
     299     * the structure of the containing element, such as in a script or comment.
     300     *
     301     * @ticket 61617
     302     *
     303     * @dataProvider data_unallowed_modifiable_text_updates
     304     *
     305     * @param string $html_with_nonempty_modifiable_text Will be used to find the test element.
     306     * @param string $invalid_update                     Update containing possibly-compromising text.
     307     */
     308    public function test_rejects_updates_with_unallowed_substrings( string $html_with_nonempty_modifiable_text, string $invalid_update ) {
     309        $processor = new WP_HTML_Tag_Processor( $html_with_nonempty_modifiable_text );
     310
     311        while ( '' === $processor->get_modifiable_text() && $processor->next_token() ) {
     312            continue;
     313        }
     314
     315        $original_text = $processor->get_modifiable_text();
     316        $this->assertNotEmpty( $original_text, 'Should have found non-empty text: check test setup.' );
     317
     318        $this->assertFalse(
     319            $processor->set_modifiable_text( $invalid_update ),
     320            'Should have rejected possibly-compromising modifiable text update.'
     321        );
     322
     323        // Flush updates.
     324        $processor->get_updated_html();
     325
     326        $this->assertSame(
     327            $original_text,
     328            $processor->get_modifiable_text(),
     329            'Should have preserved the original modifiable text before the rejected update.'
     330        );
     331    }
     332
     333    /**
     334     * Data provider.
     335     *
     336     * @return array[]
     337     */
     338    public static function data_unallowed_modifiable_text_updates() {
     339        return array(
     340            'Comment with -->'                 => array( '<!-- this is a comment -->', 'Comments end in -->' ),
     341            'Comment with --!>'                => array( '<!-- this is a comment -->', 'Invalid but legitimate comments end in --!>' ),
     342            'SCRIPT with </script>'            => array( '<script>Replace me</script>', 'Just a </script>' ),
     343            'SCRIPT with </script attributes>' => array( '<script>Replace me</script>', 'before</script id=sneak>after' ),
     344        );
     345    }
    111346}
Note: See TracChangeset for help on using the changeset viewer.