Make WordPress Core

Changeset 61477


Ignore:
Timestamp:
01/13/2026 01:11:23 PM (8 weeks ago)
Author:
jonsurrell
Message:

HTML API: Escape script tag contents automatically.

When setting JavaScript or JSON script tag content, automatically escape sequences like <script> and </script>. This renders the content safe for HTML. The semantics of any JSON and virtually any JavaScript are preserved.

Script type detection follows the HTML standard for identifying JavaScript and JSON script tags. Other script types continue to reject potentially dangerous content.

Developed in https://github.com/WordPress/wordpress-develop/pull/10635.

Props jonsurrell, dmsnell, westonruter.
Fixes #64419. See #63851, #51159.

Location:
trunk
Files:
3 added
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r61346 r61477  
    37223722     * Not all modifiable text may be set by this method, and not all content
    37233723     * may be set as modifiable text. In the case that this fails it will return
    3724      * `false` indicating as much. For instance, it will not allow inserting the
    3725      * string `</script` into a SCRIPT element, because the rules for escaping
    3726      * that safely are complicated. Similarly, it will not allow setting content
    3727      * into a comment which would prematurely terminate the comment.
     3724     * `false` indicating as much. For instance, if the contents of a SCRIPT
     3725     * element are neither JavaScript nor JSON, it’s not possible to guarantee
     3726     * that escaping strings like `</script>` won’t break the script; in these
     3727     * cases, updates will be rejected and it’s up to calling code to perform
     3728     * language-specific escaping or workarounds. Similarly, it will not allow
     3729     * setting content into a comment which would prematurely terminate the comment.
    37283730     *
    37293731     * Example:
     
    38123814        switch ( $this->get_tag() ) {
    38133815            case 'SCRIPT':
    3814                 /**
    3815                  * This is over-protective, but ensures the update doesn't break
    3816                  * the HTML structure of the SCRIPT element.
     3816                $script_content_type = $this->get_script_content_type();
     3817
     3818                switch ( $script_content_type ) {
     3819                    case 'javascript':
     3820                    case 'json':
     3821                        $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
     3822                            $this->text_starts_at,
     3823                            $this->text_length,
     3824                            self::escape_javascript_script_contents( $plaintext_content )
     3825                        );
     3826                        return true;
     3827                }
     3828
     3829                /*
     3830                 * If the script’s content type isn’t recognized and understandable then it’s
     3831                 * impossible to guarantee that escaping the content won’t cause runtime breakage.
     3832                 * For instance, if the script content type were PHP code then escaping with
     3833                 * `\u0073` would not be met by unescaping; rather, it could result in corrupted
     3834                 * data or even syntax errors.
    38173835                 *
    3818                  * More thorough analysis could track the HTML tokenizer states
    3819                  * and to ensure that the SCRIPT element closes at the expected
    3820                  * SCRIPT close tag as is done in {@see ::skip_script_data()}.
    3821                  *
    3822                  * A SCRIPT element could be closed prematurely by contents
    3823                  * like `</script>`. A SCRIPT element could be prevented from
    3824                  * closing by contents like `<!--<script>`.
    3825                  *
    3826                  * The following strings are essential for dangerous content,
    3827                  * although they are insufficient on their own. This trade-off
    3828                  * prevents dangerous scripts from being sent to the browser.
    3829                  * It is also unlikely to produce HTML that may confuse more
    3830                  * basic HTML tooling.
     3836                 * Because of this, content which could potentially modify the SCRIPT tag’s
     3837                 * HTML structure is rejected here. It’s the responsibility of calling code to
     3838                 * perform whatever semantic escaping is necessary to avoid problematic strings.
    38313839                 */
    38323840                if (
    3833                     false !== stripos( $plaintext_content, '</script' ) ||
    3834                     false !== stripos( $plaintext_content, '<script' )
     3841                    false !== stripos( $plaintext_content, '<script' ) ||
     3842                    false !== stripos( $plaintext_content, '</script' )
    38353843                ) {
    38363844                    return false;
    38373845                }
    3838 
    38393846                $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
    38403847                    $this->text_starts_at,
     
    38423849                    $plaintext_content
    38433850                );
    3844 
    38453851                return true;
    38463852
     
    38903896
    38913897        return false;
     3898    }
     3899
     3900    /**
     3901     * Returns the content type of the currently-matched HTML SCRIPT tag, if matched and
     3902     * recognized, otherwise returns `null` to indicate an unrecognized content type.
     3903     *
     3904     * An HTML SCRIPT tag is a normal SCRIPT tag, but there can be SCRIPT elements inside
     3905     * SVG and MathML elements as well, and these have different parsing rules than those
     3906     * in general HTML. For this reason, no content-type inference is performed on those.
     3907     *
     3908     * Note! This concept is related but distinct from the MIME type of the script.
     3909     * Parsing MUST match the specific algorithm in the HTML specification, which
     3910     * relies on exact string comparison in some cases. MIME type decoding may be
     3911     * performed on SVG or MathML SCRIPT tags.
     3912     *
     3913     * Only 'javascript' and 'json' content types are currently recognized.
     3914     *
     3915     * @see https://html.spec.whatwg.org/multipage/scripting.html#prepare-the-script-element
     3916     *
     3917     * @since 7.0.0
     3918     *
     3919     * @return 'javascript'|'json'|null Type of script element content if matched and recognized.
     3920     */
     3921    private function get_script_content_type(): ?string {
     3922        // SVG and MathML SCRIPT elements are not recognized.
     3923        if ( 'SCRIPT' !== $this->get_tag() || $this->get_namespace() !== 'html' ) {
     3924            return null;
     3925        }
     3926
     3927        /*
     3928         * > If any of the following are true:
     3929         * >   - el has a type attribute whose value is the empty string;
     3930         * >   - el has no type attribute but it has a language attribute and that attribute's
     3931         * >     value is the empty string; or
     3932         * >   - el has neither a type attribute nor a language attribute,
     3933         * > then let the script block's type string for this script element be "text/javascript".
     3934         */
     3935        $type = $this->get_attribute( 'type' );
     3936        $lang = $this->get_attribute( 'language' );
     3937
     3938        if ( true === $type || '' === $type ) {
     3939            return 'javascript';
     3940        }
     3941
     3942        if ( null === $type && ( null === $lang || true === $lang || '' === $lang ) ) {
     3943            return 'javascript';
     3944        }
     3945
     3946        /*
     3947         * > Otherwise, if el has a type attribute, then let the script block's type string be
     3948         * > the value of that attribute with leading and trailing ASCII whitespace stripped.
     3949         * > Otherwise, el has a non-empty language attribute; let the script block's type string
     3950         * > be the concatenation of "text/" and the value of el's language attribute.
     3951         */
     3952        $type_string = is_string( $type ) ? trim( $type, " \t\f\r\n" ) : "text/{$lang}";
     3953
     3954        // All matches are ASCII case-insensitive; eagerly lower-case for comparison.
     3955        $type_string = strtolower( $type_string );
     3956
     3957        /*
     3958         * > If the script block's type string is a JavaScript MIME type essence match, then
     3959         * > set el's type to "classic".
     3960         *
     3961         * > A string is a JavaScript MIME type essence match if it is an ASCII case-insensitive
     3962         * > match for one of the JavaScript MIME type essence strings.
     3963         *
     3964         * > A JavaScript MIME type is any MIME type whose essence is one of the following:
     3965         * >
     3966         * > - application/ecmascript
     3967         * > - application/javascript
     3968         * > - application/x-ecmascript
     3969         * > - application/x-javascript
     3970         * > - text/ecmascript
     3971         * > - text/javascript
     3972         * > - text/javascript1.0
     3973         * > - text/javascript1.1
     3974         * > - text/javascript1.2
     3975         * > - text/javascript1.3
     3976         * > - text/javascript1.4
     3977         * > - text/javascript1.5
     3978         * > - text/jscript
     3979         * > - text/livescript
     3980         * > - text/x-ecmascript
     3981         * > - text/x-javascript
     3982         *
     3983         * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type-essence-match
     3984         * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type
     3985         */
     3986        switch ( $type_string ) {
     3987            case 'application/ecmascript':
     3988            case 'application/javascript':
     3989            case 'application/x-ecmascript':
     3990            case 'application/x-javascript':
     3991            case 'text/ecmascript':
     3992            case 'text/javascript':
     3993            case 'text/javascript1.0':
     3994            case 'text/javascript1.1':
     3995            case 'text/javascript1.2':
     3996            case 'text/javascript1.3':
     3997            case 'text/javascript1.4':
     3998            case 'text/javascript1.5':
     3999            case 'text/jscript':
     4000            case 'text/livescript':
     4001            case 'text/x-ecmascript':
     4002            case 'text/x-javascript':
     4003                return 'javascript';
     4004
     4005            /*
     4006             * > Otherwise, if the script block's type string is an ASCII case-insensitive match for
     4007             * > the string "module", then set el's type to "module".
     4008             *
     4009             * A module is evaluated as JavaScript.
     4010             */
     4011            case 'module':
     4012                return 'javascript';
     4013
     4014            /*
     4015             * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "importmap", then set el's type to "importmap".
     4016             * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "speculationrules", then set el's type to "speculationrules".
     4017             *
     4018             * These conditions indicate JSON content.
     4019             */
     4020            case 'importmap':
     4021            case 'speculationrules':
     4022                return 'json';
     4023
     4024            /** @todo Rely on a full MIME parser for determining JSON content. */
     4025            case 'application/json':
     4026            case 'text/json':
     4027                return 'json';
     4028        }
     4029
     4030        /*
     4031         * > Otherwise, return. (No script is executed, and el's type is left as null.)
     4032         */
     4033        return null;
     4034    }
     4035
     4036    /**
     4037     * Escape JavaScript and JSON script tag contents.
     4038     *
     4039     * Ensure that the script contents cannot modify the HTML structure or break out
     4040     * of its containing SCRIPT element. JavaScript and JSON may both be escaped with
     4041     * the same rules, even though there are additional escaping measures available
     4042     * to JavaScript source code which aren’t applicable to serialized JSON data.
     4043     *
     4044     * A simple method safely escapes all content except for a few extremely rare and
     4045     * unlikely exceptions: prevent the appearance of `<script` and `</script` within
     4046     * the contents by replacing the first letter of the tag name with a Unicode escape.
     4047     *
     4048     * Example:
     4049     *
     4050     *     $plaintext = '<script>document.write( "A </script> closes a script." );</script>';
     4051     *     $escaped   = '<script>document.write( "A </\u0073cript> closes a script." );</script>';
     4052     *
     4053     * This works because of how parsing changes after encountering an opening SCRIPT
     4054     * tag. The actual parsing comprises a complicated state machine, the result of
     4055     * legacy behaviors and diverse browser support. However, without these two strings
     4056     * in the script contents, two key things are ensured: `</script>` cannot appear to
     4057     * prematurely close the tag, and the problematic double-escaped state becomes
     4058     * unreachable. A JavaScript engine or JSON decoder will then decode the Unicode
     4059     * escape (`\u0073`) back into its original plaintext value, but only after having
     4060     * been safely extracted from the HTML.
     4061     *
     4062     * While it may seem tempting to replace the `<` character instead, doing so would
     4063     * break JavaScript syntax. The `<` character is used in comparison operators and
     4064     * other JavaScript syntax; replacing it would break valid JavaScript. Replacing
     4065     * only the `s` in `<script` and `</script` avoids modifying JavaScript syntax.
     4066     *
     4067     * ### Exceptions
     4068     *
     4069     * This _should_ work everywhere, but there are some extreme exceptions.
     4070     *
     4071     *  - Comments.
     4072     *  - Tagged templates, such as `String.raw()`, which provide access to “raw” strings.
     4073     *  - The `source` property of a RegExp object.
     4074     *
     4075     * Each of these exceptions appears at the source code level, not at the semantic or
     4076     * evaluation level. Normal JavaScript will remain semantically equivalent after escaping,
     4077     * but any JavaScript which analyzes the raw source code will see potentially-different
     4078     * values.
     4079     *
     4080     * #### Comments
     4081     *
     4082     * Comments are never unescaped because they aren’t parsed by the JavaScript engine.
     4083     * When viewing the source in a browser’s developer tools, the comments will retain
     4084     * their escaped text.
     4085     *
     4086     * Example:
     4087     *
     4088     *     // A comment: "</script>"
     4089     *         …becomes…
     4090     *     // A comment: "</\u0073cript>"
     4091     *
     4092     * #### Tagged templates.
     4093     *
     4094     * Tagged templates “enable the embedding of arbitrary string content, where escape
     4095     * sequences may follow a different syntax.” For example, they can aid representing
     4096     * a RegExp pattern or LaTeX snippet within a JavaScript string, where the string
     4097     * escape characters might get noisy and distracting.
     4098     *
     4099     * Example:
     4100     *
     4101     *     console.log( 'A \notin B' );           // Prints a newline because of the "\n".
     4102     *     console.log( 'A \\notin B' );          // Prints "A \notin B".
     4103     *     console.log( String.raw`A \notin B` ); // Prints "A \notin B".
     4104     *
     4105     * This means that if `<script` transforms into `<\u0073cript` _inside_ a raw string
     4106     * or tagged template literal which relies on its `.raw` property, the output of the
     4107     * code will be different after escaping.
     4108     *
     4109     * Example:
     4110     *
     4111     *     console.log( String.raw`</script>` );      // Prematurely closes the SCRIPT element.
     4112     *     console.log( String.raw`</\u0073cript>` ); // Prints "</\u0073cript>".
     4113     *
     4114     * #### RegExp sources.
     4115     *
     4116     * The RegExp object exposes its raw source in a similar way to how tagged templates and raw
     4117     * strings do. Thankfully, because escape sequences are decoded when compiling the pattern,
     4118     * escaped RegExp patterns will match the same way as the plaintext sequences would.
     4119     *
     4120     * Example:
     4121     *
     4122     *     true === /<script>/.test( '<script>' );
     4123     *     true === /<\u0073cript>/.test( '<script>' );
     4124     *
     4125     * However, as with raw strings, any code which reads the source will see the escaped value
     4126     * instead of the decoded one.
     4127     *
     4128     * Example:
     4129     *
     4130     *     console.log( /<script>/.source );      // Prints "<script>".
     4131     *     console.log( /<\u0073cript>/.source ); // Prints "<\u0073cript>".
     4132     *
     4133     * #### Unsupported escaping.
     4134     *
     4135     * It is not possible to properly represent every possible JavaScript source file
     4136     * inside a SCRIPT element. As with CSS stylesheets, SVG images, and MathML, the
     4137     * only 100% reliable way to represent all possible inputs is to link to external
     4138     * files of the given content-type.
     4139     *
     4140     * In some cases it’s possible to manually prevent escaping issues. These are not
     4141     * automatically handled by this function because doing so would require a full
     4142     * JavaScript tokenizer. Consider the following example listing various ways to
     4143     * manually escape a closing script tag.
     4144     *
     4145     * Example:
     4146     *
     4147     *     console.log( String.raw`</script>` );                // !!UNSAFE!! Will be escaped.
     4148     *     console.log( String.raw`</\u0073cript>` );           // "</\u0073cript>"
     4149     *     console.log( String.raw`</scr` + String.raw`ipt>` ); // "</script>"
     4150     *     console.log( String.raw`</${"script"}>` );           // "</script>"
     4151     *     console.log( '</scr' + 'ipt>' );                     // "</script>"
     4152     *     console.log( "\x3C/script>" );                       // "</script>"
     4153     *     console.log( "<\/script>" );                         // "</script>"
     4154     *
     4155     * The following graph is a simplified interpretation of how HTML interprets the contents
     4156     * of a SCRIPT tag and identifies the closing tag. It is useful to understand what text
     4157     * is dangerous inside of a SCRIPT tag and why different approaches to escaping work.
     4158     *
     4159     *                                 Open script
     4160     *                                     │
     4161     *                                     ▼
     4162     *                  ╔═════════════════════════════════════════╗   <!--(…)>
     4163     *                  ║                                         ║   (all dashes)
     4164     *                  ║                 script                  ╟────────────────╮
     4165     *                  ║                  data                   ║                │
     4166     *      ╭───────────╢                                         ║ ◀──────────────╯
     4167     *      │           ╚═╤═══════════════════════════════════════╝
     4168     *      │             │               ▲                    ▲
     4169     *      │             │ <!--          │ -->                ╰─────╮
     4170     *      │             ▼               │                          │
     4171     *      │           ┌─────────────────┴───────────────────────┐  │
     4172     *      │ </script¹ │                 escaped                 │  │
     4173     *      │           └─┬─────────────────────────────┬─────────┘  │
     4174     *      │             │               ▲             │            │ -->
     4175     *      │             │ </script¹     │ </script¹   │ <script¹   │
     4176     *      │             ▼               │             ▼            │
     4177     *      │           ╔══════════════╗  │           ┌───────────┐  │
     4178     *      │           ║ Close script ║  │           │  double   │  │
     4179     *      ╰──────────▶║              ║  ╰───────────┤  escaped  ├──╯
     4180     *                  ╚══════════════╝              └───────────┘
     4181     *
     4182     *           ¹ = Case insensitive 'script' followed by one of ' \t\f\r\n/>', known
     4183     *               as “tag-name-terminating characters.” This sequence forms the start
     4184     *               of what could be a SCRIPT opening or closing tag.
     4185     *
     4186     * @see https://html.spec.whatwg.org/#restrictions-for-contents-of-script-elements
     4187     * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Template_literals#specifications
     4188     * @see wp_html_api_script_element_escaping_diagram_source()
     4189     *
     4190     * @since 7.0.0
     4191     *
     4192     * @param string $sourcecode Raw contents intended to be serialized into an HTML SCRIPT element.
     4193     * @return string Escaped form of input contents which will not lead to premature closing of the containing SCRIPT element.
     4194     */
     4195    private static function escape_javascript_script_contents( string $sourcecode ): string {
     4196        $at      = 0;
     4197        $was_at  = 0;
     4198        $end     = strlen( $sourcecode );
     4199        $escaped = '';
     4200
     4201        /*
     4202         * Replace all instances of the ASCII case-insensitive match of "<script"
     4203         * and "</script", when followed by whitespace or "/" or ">", by using a
     4204         * character replacement for the "s" (or the "S").
     4205         */
     4206        while ( $at < $end ) {
     4207            $tag_at = strpos( $sourcecode, '<', $at );
     4208            if ( false === $tag_at ) {
     4209                break;
     4210            }
     4211
     4212            $tag_name_at       = $tag_at + 1;
     4213            $has_closing_slash = $tag_name_at < $end && '/' === $sourcecode[ $tag_name_at ];
     4214            $tag_name_at      += $has_closing_slash ? 1 : 0;
     4215
     4216            if ( 0 !== substr_compare( $sourcecode, 'script', $tag_name_at, 6, true ) ) {
     4217                $at = $tag_at + 1;
     4218                continue;
     4219            }
     4220
     4221            if ( 1 !== strspn( $sourcecode, " \t\f\r\n/>", $tag_name_at + 6, 1 ) ) {
     4222                $at = $tag_name_at + 6;
     4223                continue;
     4224            }
     4225
     4226            $escaped .= substr( $sourcecode, $was_at, $tag_name_at - $was_at );
     4227            $escaped .= 's' === $sourcecode[ $tag_name_at ] ? '\u0073' : '\u0053';
     4228            $was_at   = $tag_name_at + 1;
     4229            $at       = $tag_name_at + 7;
     4230        }
     4231
     4232        if ( '' === $escaped ) {
     4233            return $sourcecode;
     4234        }
     4235
     4236        if ( $was_at < $end ) {
     4237            $escaped .= substr( $sourcecode, $was_at );
     4238        }
     4239
     4240        return $escaped;
    38924241    }
    38934242
  • trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php

    r60706 r61477  
    445445     * Ensures that updates with potentially-compromising values aren't accepted.
    446446     *
    447      * For example, a modifiable text update should be allowed which would break
    448      * the structure of the containing element, such as in a script or comment.
     447     * For example, a modifiable text update that would change the structure of the HTML
     448     * document is not allowed, like attempting to set `-->` within a comment or `</script>`
     449     * within a text/plain SCRIPT tag.
    449450     *
    450451     * @ticket 61617
     452     * @ticket 62797
    451453     *
    452454     * @dataProvider data_unallowed_modifiable_text_updates
     
    455457     * @param string $invalid_update                     Update containing possibly-compromising text.
    456458     */
    457     public function test_rejects_updates_with_unallowed_substrings( string $html_with_nonempty_modifiable_text, string $invalid_update ) {
     459    public function test_rejects_dangerous_updates( string $html_with_nonempty_modifiable_text, string $invalid_update ) {
    458460        $processor = new WP_HTML_Tag_Processor( $html_with_nonempty_modifiable_text );
    459461
     
    467469        $this->assertFalse(
    468470            $processor->set_modifiable_text( $invalid_update ),
    469             'Should have reject possibly-compromising modifiable text update.'
     471            'Should have rejected possibly-compromising modifiable text update.'
    470472        );
    471473
     
    487489    public static function data_unallowed_modifiable_text_updates() {
    488490        return array(
    489             'Comment with -->'                 => array( '<!-- this is a comment -->', 'Comments end in -->' ),
    490             'Comment with --!>'                => array( '<!-- this is a comment -->', 'Invalid but legitimate comments end in --!>' ),
    491             'SCRIPT with </script>'            => array( '<script>Replace me</script>', 'Just a </script>' ),
    492             'SCRIPT with </script attributes>' => array( '<script>Replace me</script>', 'before</script id=sneak>after' ),
    493             'SCRIPT with "<script " opener'    => array( '<script>Replace me</script>', '<!--<script ' ),
     491            'Comment with -->'                        => array( '<!-- this is a comment -->', 'Comments end in -->' ),
     492            'Comment with --!>'                       => array( '<!-- this is a comment -->', 'Invalid but legitimate comments end in --!>' ),
     493            'Non-JS SCRIPT with <script>'             => array( '<script type="text/html">Replace me</script>', '<!-- Just a <script>' ),
     494            'Non-JS SCRIPT with </script>'            => array( '<script type="text/plain">Replace me</script>', 'Just a </script>' ),
     495            'Non-JS SCRIPT with <script attributes>'  => array( '<script language="text">Replace me</script>', '<!-- <script sneaky>after' ),
     496            'Non-JS SCRIPT with </script attributes>' => array( '<script language="text">Replace me</script>', 'before</script sneaky>after' ),
     497        );
     498    }
     499
     500    /**
     501     * Ensures that JavaScript script tag contents are safely updated.
     502     *
     503     * @ticket 62797
     504     *
     505     * @dataProvider data_script_tag_text_updates
     506     *
     507     * @param string $html     HTML containing a SCRIPT tag to be modified.
     508     * @param string $update   Update containing possibly-compromising text.
     509     * @param string $expected Expected result.
     510     */
     511    public function test_safely_updates_script_tag_contents( string $html, string $update, string $expected ) {
     512        $processor = new WP_HTML_Tag_Processor( $html );
     513        $this->assertTrue( $processor->next_tag( 'SCRIPT' ) );
     514        $this->assertTrue( $processor->set_modifiable_text( $update ) );
     515        $this->assertSame( $expected, $processor->get_updated_html() );
     516    }
     517
     518    /**
     519     * Data provider.
     520     *
     521     * @return array[]
     522     */
     523    public static function data_script_tag_text_updates(): array {
     524        return array(
     525            'Simple update'                         => array( '<script></script>', '{}', '<script>{}</script>' ),
     526            'Needs no replacement'                  => array( '<script></script>', '<!--<scriptish>', '<script><!--<scriptish></script>' ),
     527            'var script;1<script>0'                 => array( '<script></script>', 'var script;1<script>0', '<script>var script;1<\u0073cript>0</script>' ),
     528            '1</script>/'                           => array( '<script></script>', '1</script>/', '<script>1</\u0073cript>/</script>' ),
     529            'var SCRIPT;1<SCRIPT>0'                 => array( '<script></script>', 'var SCRIPT;1<SCRIPT>0', '<script>var SCRIPT;1<\u0053CRIPT>0</script>' ),
     530            '1</SCRIPT>/'                           => array( '<script></script>', '1</SCRIPT>/', '<script>1</\u0053CRIPT>/</script>' ),
     531            '"</script>"'                           => array( '<script></script>', '"</script>"', '<script>"</\u0073cript>"</script>' ),
     532            '"</ScRiPt>"'                           => array( '<script></script>', '"</ScRiPt>"', '<script>"</\u0053cRiPt>"</script>' ),
     533            'Tricky script open tag with \r'        => array( '<script></script>', "<!-- <script\r>", "<script><!-- <\\u0073cript\r></script>" ),
     534            'Tricky script open tag with \r\n'      => array( '<script></script>', "<!-- <script\r\n>", "<script><!-- <\\u0073cript\r\n></script>" ),
     535            'Tricky script close tag with \r'       => array( '<script></script>', "// </script\r>", "<script>// </\\u0073cript\r></script>" ),
     536            'Tricky script close tag with \r\n'     => array( '<script></script>', "// </script\r\n>", "<script>// </\\u0073cript\r\n></script>" ),
     537            'Module tag'                            => array( '<script type="module"></script>', '"<script>"', '<script type="module">"<\u0073cript>"</script>' ),
     538            'Tag with type'                         => array( '<script type="text/javascript"></script>', '"<script>"', '<script type="text/javascript">"<\u0073cript>"</script>' ),
     539            'Tag with language'                     => array( '<script language="javascript"></script>', '"<script>"', '<script language="javascript">"<\u0073cript>"</script>' ),
     540            'Non-JS script, save HTML-like content' => array( '<script type="text/html"></script>', '<h1>This & that</h1>', '<script type="text/html"><h1>This & that</h1></script>' ),
     541        );
     542    }
     543
    /**
     * Ensures script-tag-like sequences are escaped automatically when setting the
     * contents of JSON (importmap) and JavaScript (module) SCRIPT elements, and that
     * the escaped JSON still decodes to the original data, both directly from the
     * processor and after re-parsing the updated HTML.
     *
     * @ticket 64419
     */
    public function test_complex_javascript_and_json_auto_escaping() {
        $processor = new WP_HTML_Tag_Processor( "<script></script>\n<script></script>\n<hr>" );
        $this->assertTrue( $processor->next_tag( 'SCRIPT' ), 'Failed to find the first SCRIPT tag.' );
        $processor->set_attribute( 'type', 'importmap' );
        $importmap_data = array(
            'imports' => array(
                '</SCRIPT>\\<!--\\<script>' => './script',
            ),
        );

        // Encode without escaping slashes so the raw "</SCRIPT>" and "<script>" sequences reach the processor.
        $importmap = json_encode(
            $importmap_data,
            JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_LINE_TERMINATORS
        );
        $this->assertNotFalse( $importmap, 'Importmap data failed to encode as JSON.' );

        $this->assertTrue(
            $processor->set_modifiable_text( "\n{$importmap}\n" ),
            'Failed to set importmap SCRIPT contents.'
        );
        $decoded_importmap = json_decode( $processor->get_modifiable_text(), true );
        $this->assertSame( JSON_ERROR_NONE, json_last_error(), 'JSON failed to decode correctly.' );
        $this->assertSame(
            $importmap_data,
            $decoded_importmap,
            'Decoded importmap did not match the original data.'
        );

        $this->assertTrue( $processor->next_tag( 'SCRIPT' ), 'Failed to find the second SCRIPT tag.' );
        $processor->set_attribute( 'type', 'module' );
        $javascript = <<<'JS'
import '</SCRIPT>\\<!--\\<script>';
JS;
        $this->assertTrue(
            $processor->set_modifiable_text( "\n{$javascript}\n" ),
            'Failed to set module SCRIPT contents.'
        );

        $expected = <<<'HTML'
<script type="importmap">
{"imports":{"</\u0053CRIPT>\\<!--\\<\u0073cript>":"./script"}}
</script>
<script type="module">
import '</\u0053CRIPT>\\<!--\\<\u0073cript>';
</script>
<hr>
HTML;

        $updated_html = $processor->get_updated_html();
        $this->assertEqualHTML( $expected, $updated_html );

        // Reprocess to ensure JSON survives HTML round-trip:
        $processor = new WP_HTML_Tag_Processor( $updated_html );
        $this->assertTrue( $processor->next_tag( 'SCRIPT' ), 'Failed to find the SCRIPT tag in the updated HTML.' );
        $this->assertSame( 'importmap', $processor->get_attribute( 'type' ) );
        $importmap_json    = $processor->get_modifiable_text();
        $decoded_importmap = json_decode( $importmap_json, true );
        $this->assertSame( JSON_ERROR_NONE, json_last_error(), 'Importmap JSON failed to decode.' );
        $this->assertSame(
            $importmap_data,
            $decoded_importmap,
            'JSON was not equal after re-processing updated HTML.'
        );
    }
     599
    /**
     * Ensures that setting JSON SCRIPT contents escapes only the script-closer
     * sequence, leaves existing backslash escapes untouched, and that the JSON
     * value is unchanged after re-parsing the updated HTML.
     *
     * @ticket 64419
     */
    public function test_json_auto_escaping() {
        // This is not a typical JSON encoding or escaping, but it is valid.
        $json_text             = '"Escaped BS: \\\\; Escaped BS+LT: \\\\<; Unescaped LT: <; Script closer: </script>"';
        $expected_decoded_json = 'Escaped BS: \\; Escaped BS+LT: \\<; Unescaped LT: <; Script closer: </script>';
        $decoded_json          = json_decode( $json_text );
        $this->assertSame( JSON_ERROR_NONE, json_last_error(), 'JSON failed to decode.' );
        $this->assertSame(
            $expected_decoded_json,
            $decoded_json,
            'Decoded JSON did not match expected value.'
        );

        $processor = new WP_HTML_Tag_Processor( '<script type="application/json"></script>' );
        $this->assertTrue( $processor->next_tag( 'SCRIPT' ), 'Failed to find the SCRIPT tag.' );

        $this->assertTrue(
            $processor->set_modifiable_text( "\n{$json_text}\n" ),
            'Failed to set JSON SCRIPT contents.'
        );

        $expected = <<<'HTML'
<script type="application/json">
"Escaped BS: \\; Escaped BS+LT: \\<; Unescaped LT: <; Script closer: </\u0073cript>"
</script>
HTML;

        $updated_html = $processor->get_updated_html();
        $this->assertEqualHTML( $expected, $updated_html );

        // Reprocess to ensure JSON value survives HTML round-trip:
        $processor = new WP_HTML_Tag_Processor( $updated_html );
        $this->assertTrue( $processor->next_tag( 'SCRIPT' ), 'Failed to find the SCRIPT tag in the updated HTML.' );
        $decoded_json_from_html = json_decode( $processor->get_modifiable_text(), true );
        $this->assertSame( JSON_ERROR_NONE, json_last_error(), 'JSON failed to decode.' );
        $this->assertSame(
            $expected_decoded_json,
            $decoded_json_from_html,
            'JSON value was not equal after re-processing updated HTML.'
        );
    }
Note: See TracChangeset for help on using the changeset viewer.