Changeset 61477
- Timestamp:
- 01/13/2026 01:11:23 PM (8 weeks ago)
- Location:
- trunk
- Files:
-
- 3 added
- 2 edited
-
src/wp-includes/html-api/class-wp-html-tag-processor.php (modified) (4 diffs)
-
tests/phpunit/data/html-api/script-element-escaping-diagram.dot (added)
-
tests/phpunit/data/html-api/script-element-escaping-diagram.php (added)
-
tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php (modified) (4 diffs)
-
tests/phpunit/tests/html-api/wpHtmlTagProcessorScriptTag.php (added)
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php
r61346 r61477 3722 3722 * Not all modifiable text may be set by this method, and not all content 3723 3723 * may be set as modifiable text. In the case that this fails it will return 3724 * `false` indicating as much. For instance, it will not allow inserting the 3725 * string `</script` into a SCRIPT element, because the rules for escaping 3726 * that safely are complicated. Similarly, it will not allow setting content 3727 * into a comment which would prematurely terminate the comment. 3724 * `false` indicating as much. For instance, if the contents of a SCRIPT 3725 * element are neither JavaScript nor JSON, it’s not possible to guarantee 3726 * that escaping strings like `</script>` won’t break the script; in these 3727 * cases, updates will be rejected and it’s up to calling code to perform 3728 * language-specific escaping or workarounds. Similarly, it will not allow 3729 * setting content into a comment which would prematurely terminate the comment. 3728 3730 * 3729 3731 * Example: … … 3812 3814 switch ( $this->get_tag() ) { 3813 3815 case 'SCRIPT': 3814 /** 3815 * This is over-protective, but ensures the update doesn't break 3816 * the HTML structure of the SCRIPT element. 3816 $script_content_type = $this->get_script_content_type(); 3817 3818 switch ( $script_content_type ) { 3819 case 'javascript': 3820 case 'json': 3821 $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement( 3822 $this->text_starts_at, 3823 $this->text_length, 3824 self::escape_javascript_script_contents( $plaintext_content ) 3825 ); 3826 return true; 3827 } 3828 3829 /* 3830 * If the script’s content type isn’t recognized and understandable then it’s 3831 * impossible to guarantee that escaping the content won’t cause runtime breakage. 3832 * For instance, if the script content type were PHP code then escaping with 3833 * `\u0073` would not be met by unescaping; rather, it could result in corrupted 3834 * data or even syntax errors. 
3817 3835 * 3818 * More thorough analysis could track the HTML tokenizer states 3819 * and to ensure that the SCRIPT element closes at the expected 3820 * SCRIPT close tag as is done in {@see ::skip_script_data()}. 3821 * 3822 * A SCRIPT element could be closed prematurely by contents 3823 * like `</script>`. A SCRIPT element could be prevented from 3824 * closing by contents like `<!--<script>`. 3825 * 3826 * The following strings are essential for dangerous content, 3827 * although they are insufficient on their own. This trade-off 3828 * prevents dangerous scripts from being sent to the browser. 3829 * It is also unlikely to produce HTML that may confuse more 3830 * basic HTML tooling. 3836 * Because of this, content which could potentially modify the SCRIPT tag’s 3837 * HTML structure is rejected here. It’s the responsibility of calling code to 3838 * perform whatever semantic escaping is necessary to avoid problematic strings. 3831 3839 */ 3832 3840 if ( 3833 false !== stripos( $plaintext_content, '< /script' ) ||3834 false !== stripos( $plaintext_content, '< script' )3841 false !== stripos( $plaintext_content, '<script' ) || 3842 false !== stripos( $plaintext_content, '</script' ) 3835 3843 ) { 3836 3844 return false; 3837 3845 } 3838 3839 3846 $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement( 3840 3847 $this->text_starts_at, … … 3842 3849 $plaintext_content 3843 3850 ); 3844 3845 3851 return true; 3846 3852 … … 3890 3896 3891 3897 return false; 3898 } 3899 3900 /** 3901 * Returns the content type of the currently-matched HTML SCRIPT tag, if matched and 3902 * recognized, otherwise returns `null` to indicate an unrecognized content type. 3903 * 3904 * An HTML SCRIPT tag is a normal SCRIPT tag, but there can be SCRIPT elements inside 3905 * SVG and MathML elements as well, and these have different parsing rules than those 3906 * in general HTML. For this reason, no content-type inference is performed on those. 3907 * 3908 * Note! 
This concept is related but distinct from the MIME type of the script. 3909 * Parsing MUST match the specific algorithm in the HTML specification, which 3910 * relies on exact string comparison in some cases. MIME type decoding may be 3911 * performed on SVG or MathML SCRIPT tags. 3912 * 3913 * Only 'javascript' and 'json' content types are currently recognized. 3914 * 3915 * @see https://html.spec.whatwg.org/multipage/scripting.html#prepare-the-script-element 3916 * 3917 * @since 7.0.0 3918 * 3919 * @return 'javascript'|'json'|null Type of script element content if matched and recognized. 3920 */ 3921 private function get_script_content_type(): ?string { 3922 // SVG and MathML SCRIPT elements are not recognized. 3923 if ( 'SCRIPT' !== $this->get_tag() || $this->get_namespace() !== 'html' ) { 3924 return null; 3925 } 3926 3927 /* 3928 * > If any of the following are true: 3929 * > - el has a type attribute whose value is the empty string; 3930 * > - el has no type attribute but it has a language attribute and that attribute's 3931 * > value is the empty string; or 3932 * > - el has neither a type attribute nor a language attribute, 3933 * > then let the script block's type string for this script element be "text/javascript". 3934 */ 3935 $type = $this->get_attribute( 'type' ); 3936 $lang = $this->get_attribute( 'language' ); 3937 3938 if ( true === $type || '' === $type ) { 3939 return 'javascript'; 3940 } 3941 3942 if ( null === $type && ( null === $lang || true === $lang || '' === $lang ) ) { 3943 return 'javascript'; 3944 } 3945 3946 /* 3947 * > Otherwise, if el has a type attribute, then let the script block's type string be 3948 * > the value of that attribute with leading and trailing ASCII whitespace stripped. 3949 * > Otherwise, el has a non-empty language attribute; let the script block's type string 3950 * > be the concatenation of "text/" and the value of el's language attribute. 3951 */ 3952 $type_string = is_string( $type ) ? 
trim( $type, " \t\f\r\n" ) : "text/{$lang}"; 3953 3954 // All matches are ASCII case-insensitive; eagerly lower-case for comparison. 3955 $type_string = strtolower( $type_string ); 3956 3957 /* 3958 * > If the script block's type string is a JavaScript MIME type essence match, then 3959 * > set el's type to "classic". 3960 * 3961 * > A string is a JavaScript MIME type essence match if it is an ASCII case-insensitive 3962 * > match for one of the JavaScript MIME type essence strings. 3963 * 3964 * > A JavaScript MIME type is any MIME type whose essence is one of the following: 3965 * > 3966 * > - application/ecmascript 3967 * > - application/javascript 3968 * > - application/x-ecmascript 3969 * > - application/x-javascript 3970 * > - text/ecmascript 3971 * > - text/javascript 3972 * > - text/javascript1.0 3973 * > - text/javascript1.1 3974 * > - text/javascript1.2 3975 * > - text/javascript1.3 3976 * > - text/javascript1.4 3977 * > - text/javascript1.5 3978 * > - text/jscript 3979 * > - text/livescript 3980 * > - text/x-ecmascript 3981 * > - text/x-javascript 3982 * 3983 * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type-essence-match 3984 * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type 3985 */ 3986 switch ( $type_string ) { 3987 case 'application/ecmascript': 3988 case 'application/javascript': 3989 case 'application/x-ecmascript': 3990 case 'application/x-javascript': 3991 case 'text/ecmascript': 3992 case 'text/javascript': 3993 case 'text/javascript1.0': 3994 case 'text/javascript1.1': 3995 case 'text/javascript1.2': 3996 case 'text/javascript1.3': 3997 case 'text/javascript1.4': 3998 case 'text/javascript1.5': 3999 case 'text/jscript': 4000 case 'text/livescript': 4001 case 'text/x-ecmascript': 4002 case 'text/x-javascript': 4003 return 'javascript'; 4004 4005 /* 4006 * > Otherwise, if the script block's type string is an ASCII case-insensitive match for 4007 * > the string "module", then set el's type to "module". 
4008 * 4009 * A module is evaluated as JavaScript. 4010 */ 4011 case 'module': 4012 return 'javascript'; 4013 4014 /* 4015 * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "importmap", then set el's type to "importmap". 4016 * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "speculationrules", then set el's type to "speculationrules". 4017 * 4018 * These conditions indicate JSON content. 4019 */ 4020 case 'importmap': 4021 case 'speculationrules': 4022 return 'json'; 4023 4024 /** @todo Rely on a full MIME parser for determining JSON content. */ 4025 case 'application/json': 4026 case 'text/json': 4027 return 'json'; 4028 } 4029 4030 /* 4031 * > Otherwise, return. (No script is executed, and el's type is left as null.) 4032 */ 4033 return null; 4034 } 4035 4036 /** 4037 * Escape JavaScript and JSON script tag contents. 4038 * 4039 * Ensure that the script contents cannot modify the HTML structure or break out 4040 * of its containing SCRIPT element. JavaScript and JSON may both be escaped with 4041 * the same rules, even though there are additional escaping measures available 4042 * to JavaScript source code which aren’t applicable to serialized JSON data. 4043 * 4044 * A simple method safely escapes all content except for a few extremely rare and 4045 * unlikely exceptions: prevent the appearance of `<script` and `</script` within 4046 * the contents by replacing the first letter of the tag name with a Unicode escape. 4047 * 4048 * Example: 4049 * 4050 * $plaintext = '<script>document.write( "A </script> closes a script." );</script>'; 4051 * $escaped = '<script>document.write( "A </\u0073cript> closes a script." );</script>'; 4052 * 4053 * This works because of how parsing changes after encountering an opening SCRIPT 4054 * tag. The actual parsing comprises a complicated state machine, the result of 4055 * legacy behaviors and diverse browser support. 
However, without these two strings 4056 * in the script contents, two key things are ensured: `</script>` cannot appear to 4057 * prematurely close the tag, and the problematic double-escaped state becomes 4058 * unreachable. A JavaScript engine or JSON decoder will then decode the Unicode 4059 * escape (`\u0073`) back into its original plaintext value, but only after having 4060 * been safely extracted from the HTML. 4061 * 4062 * While it may seem tempting to replace the `<` character instead, doing so would 4063 * break JavaScript syntax. The `<` character is used in comparison operators and 4064 * other JavaScript syntax; replacing it would break valid JavaScript. Replacing 4065 * only the `s` in `<script` and `</script` avoids modifying JavaScript syntax. 4066 * 4067 * ### Exceptions 4068 * 4069 * This _should_ work everywhere, but there are some extreme exceptions. 4070 * 4071 * - Comments. 4072 * - Tagged templates, such as `String.raw()`, which provide access to “raw” strings. 4073 * - The `source` property of a RegExp object. 4074 * 4075 * Each of these exceptions appears at the source code level, not at the semantic or 4076 * evaluation level. Normal JavaScript will remain semantically equivalent after escaping, 4077 * but any JavaScript which analyzes the raw source code will see potentially-different 4078 * values. 4079 * 4080 * #### Comments 4081 * 4082 * Comments are never unescaped because they aren’t parsed by the JavaScript engine. 4083 * When viewing the source in a browser’s developer tools, the comments will retain 4084 * their escaped text. 4085 * 4086 * Example: 4087 * 4088 * // A comment: "</script>" 4089 * …becomes… 4090 * // A comment: "</\u0073cript>" 4091 * 4092 * #### Tagged templates. 
4093 * 4094 * Tagged templates “enable the embedding of arbitrary string content, where escape 4095 * sequences may follow a different syntax.” For example, they can aid representing 4096 * a RegExp pattern or LaTeX snippet within a JavaScript string, where the string 4097 * escape characters might get noisy and distracting. 4098 * 4099 * Example: 4100 * 4101 * console.log( 'A \notin B' ); // Prints a newline because of the "\n". 4102 * console.log( 'A \\notin B' ); // Prints "A \notin B". 4103 * console.log( String.raw`A \notin B` ); // Prints "A \notin B". 4104 * 4105 * This means that if `<script` transforms into `<\u0073cript` _inside_ a raw string 4106 * or tagged template literal which relies on its `.raw` property, the output of the 4107 * code will be different after escaping. 4108 * 4109 * Example: 4110 * 4111 * console.log( String.raw`</script>` ); // Prematurely closes the SCRIPT element. 4112 * console.log( String.raw`</\u0073cript>` ); // Prints "</\u0073cript>". 4113 * 4114 * #### RegExp sources. 4115 * 4116 * The RegExp object exposes its raw source in a similar way to how tagged templates and raw 4117 * strings do. Thankfully, because escape sequences are decoded when compiling the pattern, 4118 * escaped RegExp patterns will match the same way as the plaintext sequences would. 4119 * 4120 * Example: 4121 * 4122 * true === /<script>/.test( '<script>' ); 4123 * true === /<\u0073cript>/.test( '<script>' ); 4124 * 4125 * However, as with raw strings, any code which reads the source will see the escaped value 4126 * instead of the decoded one. 4127 * 4128 * Example: 4129 * 4130 * console.log( /<script>/.source ); // Prints "<script>". 4131 * console.log( /<\u0073cript>/.source ); // Prints "<\u0073cript>". 4132 * 4133 * #### Unsupported escaping. 4134 * 4135 * It is not possible to properly represent every possible JavaScript source file 4136 * inside a SCRIPT element. 
As with CSS stylesheets, SVG images, and MathML, the 4137 * only 100% reliable way to represent all possible inputs is to link to external 4138 * files of the given content-type. 4139 * 4140 * In some cases it’s possible to manually prevent escaping issues. These are not 4141 * automatically handled by this function because doing so would require a full 4142 * JavaScript tokenizer. Consider the following example listing various ways to 4143 * manually escape a closing script tag. 4144 * 4145 * Example: 4146 * 4147 * console.log( String.raw`</script>` ); // !!UNSAFE!! Will be escaped. 4148 * console.log( String.raw`</\u0073cript>` ); // "</\u0073cript>" 4149 * console.log( String.raw`</scr` + String.raw`ipt>` ); // "</script>" 4150 * console.log( String.raw`</${"script"}>` ); // "</script>" 4151 * console.log( '</scr' + 'ipt>' ); // "</script>" 4152 * console.log( "\x3C/script>" ); // "</script>" 4153 * console.log( "<\/script>" ); // "</script>" 4154 * 4155 * The following graph is a simplified interpretation of how HTML interprets the contents 4156 * of a SCRIPT tag and identifies the closing tag. It is useful to understand what text 4157 * is dangerous inside of a SCRIPT tag and why different approaches to escaping work. 
4158 * 4159 * Open script 4160 * │ 4161 * ▼ 4162 * ╔═════════════════════════════════════════╗ <!--(…)> 4163 * ║ ║ (all dashes) 4164 * ║ script ╟────────────────╮ 4165 * ║ data ║ │ 4166 * ╭───────────╢ ║ ◀──────────────╯ 4167 * │ ╚═╤═══════════════════════════════════════╝ 4168 * │ │ ▲ ▲ 4169 * │ │ <!-- │ --> ╰─────╮ 4170 * │ ▼ │ │ 4171 * │ ┌─────────────────┴───────────────────────┐ │ 4172 * │ </script¹ │ escaped │ │ 4173 * │ └─┬─────────────────────────────┬─────────┘ │ 4174 * │ │ ▲ │ │ --> 4175 * │ │ </script¹ │ </script¹ │ <script¹ │ 4176 * │ ▼ │ ▼ │ 4177 * │ ╔══════════════╗ │ ┌───────────┐ │ 4178 * │ ║ Close script ║ │ │ double │ │ 4179 * ╰──────────▶║ ║ ╰───────────┤ escaped ├──╯ 4180 * ╚══════════════╝ └───────────┘ 4181 * 4182 * ¹ = Case insensitive 'script' followed by one of ' \t\f\r\n/>', known 4183 * as “tag-name-terminating characters.” This sequence forms the start 4184 * of what could be a SCRIPT opening or closing tag. 4185 * 4186 * @see https://html.spec.whatwg.org/#restrictions-for-contents-of-script-elements 4187 * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Template_literals#specifications 4188 * @see wp_html_api_script_element_escaping_diagram_source() 4189 * 4190 * @since 7.0.0 4191 * 4192 * @param string $sourcecode Raw contents intended to be serialized into an HTML SCRIPT element. 4193 * @return string Escaped form of input contents which will not lead to premature closing of the containing SCRIPT element. 4194 */ 4195 private static function escape_javascript_script_contents( string $sourcecode ): string { 4196 $at = 0; 4197 $was_at = 0; 4198 $end = strlen( $sourcecode ); 4199 $escaped = ''; 4200 4201 /* 4202 * Replace all instances of the ASCII case-insensitive match of "<script" 4203 * and "</script", when followed by whitespace or "/" or ">", by using a 4204 * character replacement for the "s" (or the "S"). 
4205 */ 4206 while ( $at < $end ) { 4207 $tag_at = strpos( $sourcecode, '<', $at ); 4208 if ( false === $tag_at ) { 4209 break; 4210 } 4211 4212 $tag_name_at = $tag_at + 1; 4213 $has_closing_slash = $tag_name_at < $end && '/' === $sourcecode[ $tag_name_at ]; 4214 $tag_name_at += $has_closing_slash ? 1 : 0; 4215 4216 if ( 0 !== substr_compare( $sourcecode, 'script', $tag_name_at, 6, true ) ) { 4217 $at = $tag_at + 1; 4218 continue; 4219 } 4220 4221 if ( 1 !== strspn( $sourcecode, " \t\f\r\n/>", $tag_name_at + 6, 1 ) ) { 4222 $at = $tag_name_at + 6; 4223 continue; 4224 } 4225 4226 $escaped .= substr( $sourcecode, $was_at, $tag_name_at - $was_at ); 4227 $escaped .= 's' === $sourcecode[ $tag_name_at ] ? '\u0073' : '\u0053'; 4228 $was_at = $tag_name_at + 1; 4229 $at = $tag_name_at + 7; 4230 } 4231 4232 if ( '' === $escaped ) { 4233 return $sourcecode; 4234 } 4235 4236 if ( $was_at < $end ) { 4237 $escaped .= substr( $sourcecode, $was_at ); 4238 } 4239 4240 return $escaped; 3892 4241 } 3893 4242 -
trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php
r60706 r61477 445 445 * Ensures that updates with potentially-compromising values aren't accepted. 446 446 * 447 * For example, a modifiable text update should be allowed which would break 448 * the structure of the containing element, such as in a script or comment. 447 * For example, a modifiable text update that would change the structure of the HTML 448 * document is not allowed, like attempting to set `-->` within a comment or `</script>` 449 * within a text/plain SCRIPT tag. 449 450 * 450 451 * @ticket 61617 452 * @ticket 62797 451 453 * 452 454 * @dataProvider data_unallowed_modifiable_text_updates … … 455 457 * @param string $invalid_update Update containing possibly-compromising text. 456 458 */ 457 public function test_rejects_ updates_with_unallowed_substrings( string $html_with_nonempty_modifiable_text, string $invalid_update ) {459 public function test_rejects_dangerous_updates( string $html_with_nonempty_modifiable_text, string $invalid_update ) { 458 460 $processor = new WP_HTML_Tag_Processor( $html_with_nonempty_modifiable_text ); 459 461 … … 467 469 $this->assertFalse( 468 470 $processor->set_modifiable_text( $invalid_update ), 469 'Should have reject possibly-compromising modifiable text update.'471 'Should have rejected possibly-compromising modifiable text update.' 
470 472 ); 471 473 … … 487 489 public static function data_unallowed_modifiable_text_updates() { 488 490 return array( 489 'Comment with -->' => array( '<!-- this is a comment -->', 'Comments end in -->' ), 490 'Comment with --!>' => array( '<!-- this is a comment -->', 'Invalid but legitimate comments end in --!>' ), 491 'SCRIPT with </script>' => array( '<script>Replace me</script>', 'Just a </script>' ), 492 'SCRIPT with </script attributes>' => array( '<script>Replace me</script>', 'before</script id=sneak>after' ), 493 'SCRIPT with "<script " opener' => array( '<script>Replace me</script>', '<!--<script ' ), 491 'Comment with -->' => array( '<!-- this is a comment -->', 'Comments end in -->' ), 492 'Comment with --!>' => array( '<!-- this is a comment -->', 'Invalid but legitimate comments end in --!>' ), 493 'Non-JS SCRIPT with <script>' => array( '<script type="text/html">Replace me</script>', '<!-- Just a <script>' ), 494 'Non-JS SCRIPT with </script>' => array( '<script type="text/plain">Replace me</script>', 'Just a </script>' ), 495 'Non-JS SCRIPT with <script attributes>' => array( '<script language="text">Replace me</script>', '<!-- <script sneaky>after' ), 496 'Non-JS SCRIPT with </script attributes>' => array( '<script language="text">Replace me</script>', 'before</script sneaky>after' ), 497 ); 498 } 499 500 /** 501 * Ensures that JavaScript script tag contents are safely updated. 502 * 503 * @ticket 62797 504 * 505 * @dataProvider data_script_tag_text_updates 506 * 507 * @param string $html HTML containing a SCRIPT tag to be modified. 508 * @param string $update Update containing possibly-compromising text. 509 * @param string $expected Expected result. 
510 */ 511 public function test_safely_updates_script_tag_contents( string $html, string $update, string $expected ) { 512 $processor = new WP_HTML_Tag_Processor( $html ); 513 $this->assertTrue( $processor->next_tag( 'SCRIPT' ) ); 514 $this->assertTrue( $processor->set_modifiable_text( $update ) ); 515 $this->assertSame( $expected, $processor->get_updated_html() ); 516 } 517 518 /** 519 * Data provider. 520 * 521 * @return array[] 522 */ 523 public static function data_script_tag_text_updates(): array { 524 return array( 525 'Simple update' => array( '<script></script>', '{}', '<script>{}</script>' ), 526 'Needs no replacement' => array( '<script></script>', '<!--<scriptish>', '<script><!--<scriptish></script>' ), 527 'var script;1<script>0' => array( '<script></script>', 'var script;1<script>0', '<script>var script;1<\u0073cript>0</script>' ), 528 '1</script>/' => array( '<script></script>', '1</script>/', '<script>1</\u0073cript>/</script>' ), 529 'var SCRIPT;1<SCRIPT>0' => array( '<script></script>', 'var SCRIPT;1<SCRIPT>0', '<script>var SCRIPT;1<\u0053CRIPT>0</script>' ), 530 '1</SCRIPT>/' => array( '<script></script>', '1</SCRIPT>/', '<script>1</\u0053CRIPT>/</script>' ), 531 '"</script>"' => array( '<script></script>', '"</script>"', '<script>"</\u0073cript>"</script>' ), 532 '"</ScRiPt>"' => array( '<script></script>', '"</ScRiPt>"', '<script>"</\u0053cRiPt>"</script>' ), 533 'Tricky script open tag with \r' => array( '<script></script>', "<!-- <script\r>", "<script><!-- <\\u0073cript\r></script>" ), 534 'Tricky script open tag with \r\n' => array( '<script></script>', "<!-- <script\r\n>", "<script><!-- <\\u0073cript\r\n></script>" ), 535 'Tricky script close tag with \r' => array( '<script></script>', "// </script\r>", "<script>// </\\u0073cript\r></script>" ), 536 'Tricky script close tag with \r\n' => array( '<script></script>', "// </script\r\n>", "<script>// </\\u0073cript\r\n></script>" ), 537 'Module tag' => array( '<script type="module"></script>', 
'"<script>"', '<script type="module">"<\u0073cript>"</script>' ), 538 'Tag with type' => array( '<script type="text/javascript"></script>', '"<script>"', '<script type="text/javascript">"<\u0073cript>"</script>' ), 539 'Tag with language' => array( '<script language="javascript"></script>', '"<script>"', '<script language="javascript">"<\u0073cript>"</script>' ), 540 'Non-JS script, save HTML-like content' => array( '<script type="text/html"></script>', '<h1>This & that</h1>', '<script type="text/html"><h1>This & that</h1></script>' ), 541 ); 542 } 543 544 /** 545 * @ticket 64419 546 */ 547 public function test_complex_javascript_and_json_auto_escaping() { 548 $processor = new WP_HTML_Tag_Processor( "<script></script>\n<script></script>\n<hr>" ); 549 $processor->next_tag( 'SCRIPT' ); 550 $processor->set_attribute( 'type', 'importmap' ); 551 $importmap_data = array( 552 'imports' => array( 553 '</SCRIPT>\\<!--\\<script>' => './script', 554 ), 555 ); 556 557 $importmap = json_encode( 558 $importmap_data, 559 JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_LINE_TERMINATORS 560 ); 561 562 $processor->set_modifiable_text( "\n{$importmap}\n" ); 563 $decoded_importmap = json_decode( $processor->get_modifiable_text(), true ); 564 $this->assertSame( JSON_ERROR_NONE, json_last_error(), 'JSON failed to decode correctly.' 
); 565 $this->assertEquals( $importmap_data, $decoded_importmap ); 566 $processor->next_tag( 'SCRIPT' ); 567 $processor->set_attribute( 'type', 'module' ); 568 $javascript = <<<'JS' 569 import '</SCRIPT>\\<!--\\<script>'; 570 JS; 571 $processor->set_modifiable_text( "\n{$javascript}\n" ); 572 573 $expected = <<<'HTML' 574 <script type="importmap"> 575 {"imports":{"</\u0053CRIPT>\\<!--\\<\u0073cript>":"./script"}} 576 </script> 577 <script type="module"> 578 import '</\u0053CRIPT>\\<!--\\<\u0073cript>'; 579 </script> 580 <hr> 581 HTML; 582 583 $updated_html = $processor->get_updated_html(); 584 $this->assertEqualHTML( $expected, $updated_html ); 585 586 // Reprocess to ensure JSON survives HTML round-trip: 587 $processor = new WP_HTML_Tag_Processor( $updated_html ); 588 $processor->next_tag( 'SCRIPT' ); 589 $this->assertSame( 'importmap', $processor->get_attribute( 'type' ) ); 590 $importmap_json = $processor->get_modifiable_text(); 591 $decoded_importmap = json_decode( $importmap_json, true ); 592 $this->assertSame( JSON_ERROR_NONE, json_last_error(), 'Importmap JSON failed to decode.' ); 593 $this->assertEquals( 594 $importmap_data, 595 $decoded_importmap, 596 'JSON was not equal after re-processing updated HTML.' 597 ); 598 } 599 600 /** 601 * @ticket 64419 602 */ 603 public function test_json_auto_escaping() { 604 // This is not a typical JSON encoding or escaping, but it is valid. 605 $json_text = '"Escaped BS: \\\\; Escaped BS+LT: \\\\<; Unescaped LT: <; Script closer: </script>"'; 606 $expected_decoded_json = 'Escaped BS: \\; Escaped BS+LT: \\<; Unescaped LT: <; Script closer: </script>'; 607 $decoded_json = json_decode( $json_text ); 608 $this->assertSame( JSON_ERROR_NONE, json_last_error(), 'JSON failed to decode.' ); 609 $this->assertSame( 610 $expected_decoded_json, 611 $decoded_json, 612 'Decoded JSON did not match expected value.' 
613 ); 614 615 $processor = new WP_HTML_Tag_Processor( '<script type="application/json"></script>' ); 616 $processor->next_tag( 'SCRIPT' ); 617 618 $processor->set_modifiable_text( "\n{$json_text}\n" ); 619 620 $expected = <<<'HTML' 621 <script type="application/json"> 622 "Escaped BS: \\; Escaped BS+LT: \\<; Unescaped LT: <; Script closer: </\u0073cript>" 623 </script> 624 HTML; 625 626 $updated_html = $processor->get_updated_html(); 627 $this->assertEqualHTML( $expected, $updated_html ); 628 629 // Reprocess to ensure JSON value survives HTML round-trip: 630 $processor = new WP_HTML_Tag_Processor( $updated_html ); 631 $processor->next_tag( 'SCRIPT' ); 632 $decoded_json_from_html = json_decode( $processor->get_modifiable_text(), true ); 633 $this->assertSame( JSON_ERROR_NONE, json_last_error(), 'JSON failed to decode.' ); 634 $this->assertEquals( 635 $expected_decoded_json, 636 $decoded_json_from_html 494 637 ); 495 638 }
Note: See TracChangeset
for help on using the changeset viewer.