Changeset 58829
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php
/**
 * Sets the modifiable text for the matched token, if matched.
 *
 * Modifiable text is text content that may be read and changed without
 * changing the HTML structure of the document around it. This includes
 * the contents of `#text` nodes in the HTML as well as the inner
 * contents of HTML comments, Processing Instructions, and others, even
 * though these nodes aren't part of a parsed DOM tree. They also contain
 * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
 * other section in an HTML document which cannot contain HTML markup (DATA).
 *
 * Not all modifiable text may be set by this method, and not all content
 * may be set as modifiable text. In the case that this fails it will return
 * `false` indicating as much. For instance, it will not allow inserting the
 * string `</script` into a SCRIPT element, because the rules for escaping
 * that safely are complicated. Similarly, it will not allow setting content
 * into a comment which would prematurely terminate the comment.
 *
 * How the new text is stored depends on the matched token:
 *
 *  - `#text` node: encoded with `htmlspecialchars()`; invalid byte sequences
 *    are substituted instead of discarding the entire string.
 *  - HTML comment: stored verbatim. Rejected when it contains `-->` or
 *    `--!>`, or when it starts with `>` or `->`.
 *  - SCRIPT: stored verbatim. Rejected when it contains `</script`,
 *    compared case-insensitively.
 *  - STYLE: every `</style` is rewritten with the CSS escapes `\3c\2f`.
 *  - TEXTAREA and TITLE: every closer for the matched tag name is rewritten
 *    to start with `&lt;/`.
 *  - Anything else: no update is enqueued and `false` is returned.
 *
 * Example:
 *
 *     // Add a preface to all STYLE contents.
 *     while ( $processor->next_tag( 'STYLE' ) ) {
 *         $style = $processor->get_modifiable_text();
 *         $processor->set_modifiable_text( "// Made with love on the World Wide Web\n{$style}" );
 *     }
 *
 *     // Replace smiley text with Emoji smilies.
 *     while ( $processor->next_token() ) {
 *         if ( '#text' !== $processor->get_token_name() ) {
 *             continue;
 *         }
 *
 *         $chunk = $processor->get_modifiable_text();
 *         if ( ! str_contains( $chunk, ':)' ) ) {
 *             continue;
 *         }
 *
 *         $processor->set_modifiable_text( str_replace( ':)', '🙂', $chunk ) );
 *     }
 *
 * @since 6.7.0
 *
 * @param string $plaintext_content New text content to represent in the matched token.
 *
 * @return bool Whether the text was able to update.
 */
public function set_modifiable_text( string $plaintext_content ): bool {
	if ( self::STATE_TEXT_NODE === $this->parser_state ) {
		/*
		 * Without ENT_SUBSTITUTE, `htmlspecialchars()` returns an empty string
		 * for input containing an invalid byte sequence, which would silently
		 * erase the text while still reporting success.
		 */
		$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
			$this->text_starts_at,
			$this->text_length,
			htmlspecialchars( $plaintext_content, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE )
		);

		return true;
	}

	// Comment data is not encoded.
	if (
		self::STATE_COMMENT === $this->parser_state &&
		self::COMMENT_AS_HTML_COMMENT === $this->comment_type
	) {
		/*
		 * Check if the text could close the comment.
		 *
		 * Besides `-->` and `--!>` anywhere in the text, a leading `>` or `->`
		 * would produce `<!-->` or `<!--->`, which HTML treats as an
		 * abruptly-closed empty comment; the rest of the text and the
		 * original closer would then leak into the document.
		 */
		if ( 1 === preg_match( '/^-?>|--!?>/', $plaintext_content ) ) {
			return false;
		}

		$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
			$this->text_starts_at,
			$this->text_length,
			$plaintext_content
		);

		return true;
	}

	if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
		return false;
	}

	switch ( $this->get_tag() ) {
		case 'SCRIPT':
			/*
			 * This is over-protective, but ensures the update doesn't break
			 * out of the SCRIPT element. A more thorough check would need to
			 * ensure that the script closing tag doesn't exist, and isn't
			 * also "hidden" inside the script double-escaped state.
			 *
			 * It may seem like replacing `</script` with `<\/script` would
			 * properly escape these things, but this could mask regex patterns
			 * that previously worked. Resolve this by not sending `</script`
			 */
			if ( false !== stripos( $plaintext_content, '</script' ) ) {
				return false;
			}

			$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
				$this->text_starts_at,
				$this->text_length,
				$plaintext_content
			);

			return true;

		case 'STYLE':
			// Hide any STYLE closer behind CSS escapes for `<` (3c) and `/` (2f).
			$escaped_content = preg_replace_callback(
				'~</(?P<TAG_NAME>style)~i',
				static function ( $tag_match ) {
					return "\\3c\\2f{$tag_match['TAG_NAME']}";
				},
				$plaintext_content
			);

			// A PCRE failure yields `null`; refuse the update instead of enqueueing it.
			if ( null === $escaped_content ) {
				return false;
			}

			$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
				$this->text_starts_at,
				$this->text_length,
				$escaped_content
			);

			return true;

		case 'TEXTAREA':
		case 'TITLE':
			/*
			 * Encode the `<` of anything resembling the closing tag of the
			 * matched element so that the new text cannot end it early.
			 * The original letter-casing of the tag name is preserved.
			 */
			$escaped_content = preg_replace_callback(
				"~</(?P<TAG_NAME>{$this->get_tag()})~i",
				static function ( $tag_match ) {
					return "&lt;/{$tag_match['TAG_NAME']}";
				},
				$plaintext_content
			);

			// A PCRE failure yields `null`; refuse the update instead of enqueueing it.
			if ( null === $escaped_content ) {
				return false;
			}

			/*
			 * These don't _need_ to be escaped, but since they are decoded it's
			 * safe to leave them escaped and this can prevent other code from
			 * naively detecting tags within the contents.
			 *
			 * @todo It would be useful to prefix a multiline replacement text
			 *       with a newline, but not necessary. This is for aesthetics.
			 */
			$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
				$this->text_starts_at,
				$this->text_length,
				$escaped_content
			);

			return true;
	}

	return false;
}
trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php
/**
 * Ensures that updates to modifiable text that are shorter than the
 * original text do not cause the parser to lose its orientation.
 *
 * @ticket 61617
 */
public function test_setting_shorter_modifiable_text() {
	$processor = new WP_HTML_Tag_Processor( '<div><textarea>very long text</textarea><div id="not a <span>">' );

	// Find the test node in the middle.
	while ( 'TEXTAREA' !== $processor->get_token_name() && $processor->next_token() ) {
		continue;
	}

	$this->assertSame(
		'TEXTAREA',
		$processor->get_token_name(),
		'Failed to find the test TEXTAREA node; check the test setup.'
	);

	$this->assertTrue(
		$processor->set_modifiable_text( 'short' ),
		'Should have accepted the shorter replacement text.'
	);

	// Flush the enqueued update so the following reads run against the rewritten document.
	$processor->get_updated_html();
	$this->assertSame(
		'short',
		$processor->get_modifiable_text(),
		'Should have updated modifiable text to something shorter than the original.'
	);

	$this->assertTrue(
		$processor->next_token(),
		'Should have advanced to the last token in the input.'
	);

	$this->assertSame(
		'DIV',
		$processor->get_token_name(),
		'Should have recognized the final DIV in the input.'
	);

	$this->assertSame(
		'not a <span>',
		$processor->get_attribute( 'id' ),
		'Should have read in the id from the last DIV as "not a <span>"'
	);
}

/**
 * Ensures that reads of modifiable text after setting it return the updated
 * enqueued value, and not the original value, and that the enqueued value
 * does not carry over to the next token.
 *
 * @ticket 61617
 */
public function test_modifiable_text_reads_updates_after_setting() {
	$processor = new WP_HTML_Tag_Processor( 'This is text<!-- this is not -->' );

	$processor->next_token();
	$this->assertSame(
		'#text',
		$processor->get_token_name(),
		'Failed to find first text node: check test setup.'
	);

	$update = 'This is new text';
	$this->assertTrue(
		$processor->set_modifiable_text( $update ),
		'Should have accepted the update to the text node.'
	);
	$this->assertSame(
		$update,
		$processor->get_modifiable_text(),
		'Failed to read updated enqueued value of text node.'
	);

	$processor->next_token();
	$this->assertSame(
		'#comment',
		$processor->get_token_name(),
		'Failed to advance to comment: check test setup.'
	);

	$this->assertSame(
		' this is not ',
		$processor->get_modifiable_text(),
		'Failed to read modifiable text for next token; did it read the old enqueued value from the previous token?'
	);
}

/**
 * Ensures that modifiable text updates are not applied where they aren't supported.
 *
 * @ticket 61617
 *
 * @dataProvider data_tokens_not_supporting_modifiable_text_updates
 *
 * @param string $html             Contains HTML with a token not supporting modifiable text updates.
 * @param int    $advance_n_tokens Count of times to run `next_token()` before reaching target node.
 */
public function test_rejects_updates_on_unsupported_match_locations( string $html, int $advance_n_tokens ) {
	$processor = new WP_HTML_Tag_Processor( $html );
	while ( --$advance_n_tokens >= 0 ) {
		$processor->next_token();
	}

	$this->assertFalse(
		$processor->set_modifiable_text( 'Bazinga!' ),
		'Should have prevented modifying the text at the target node.'
	);

	$this->assertSame(
		$html,
		$processor->get_updated_html(),
		'Should not have modified the input document in any way.'
	);
}

/**
 * Data provider.
 *
 * Each dataset is an HTML input followed by the number of `next_token()`
 * calls to make before attempting the update.
 *
 * @return array[]
 */
public static function data_tokens_not_supporting_modifiable_text_updates() {
	return array(
		'Before parsing'               => array( 'nothing to see here', 0 ),
		'After parsing'                => array( 'nothing here either', 2 ),
		'Incomplete document'          => array( '<tag without="an end', 1 ),
		'Presumptuous closer'          => array( 'before</>after', 2 ),
		'Invalid (CDATA)'              => array( '<![CDATA[this is a comment]]>', 1 ),
		'Invalid (shortest comment)'   => array( '<!-->', 1 ),
		'Invalid (shorter comment)'    => array( '<!--->', 1 ),
		'Invalid (markup declaration)' => array( '<!run>', 1 ),
		'Invalid (PI-like node)'       => array( '<?xml is not html ?>', 1 ),
	);
}

/**
 * Ensures that modifiable text updates are applied as expected to supported nodes.
 *
 * @ticket 61617
 *
 * @dataProvider data_tokens_with_basic_modifiable_text_updates
 *
 * @param string $html             Contains HTML with a token supporting modifiable text updates.
 * @param int    $advance_n_tokens Count of times to run `next_token()` before reaching target node.
 * @param string $raw_replacement  This should be escaped properly when replaced as modifiable text.
 * @param string $transformed      Expected output after updating modifiable text.
 */
public function test_updates_basic_modifiable_text_on_supported_nodes( string $html, int $advance_n_tokens, string $raw_replacement, string $transformed ) {
	$processor = new WP_HTML_Tag_Processor( $html );
	while ( --$advance_n_tokens >= 0 ) {
		$processor->next_token();
	}

	$this->assertTrue(
		$processor->set_modifiable_text( $raw_replacement ),
		'Should have modified the text at the target node.'
	);

	$this->assertSame(
		$transformed,
		$processor->get_updated_html(),
		"Should have transformed the HTML as expected while modifying the target node's modifiable text."
	);
}

/**
 * Data provider.
 *
 * Each dataset is: HTML input, number of `next_token()` calls to reach the
 * target node, raw replacement text, and the expected full document after
 * the update.
 *
 * The expected documents hold the escaped form of the replacement: text
 * nodes are entity-encoded, a STYLE closer is written with CSS escapes, and
 * TEXTAREA or TITLE closers have their `<` encoded as `&lt;`. SCRIPT
 * contents are expected verbatim.
 *
 * @return array[]
 */
public static function data_tokens_with_basic_modifiable_text_updates() {
	return array(
		'Text node (start)'       => array( 'Text', 1, 'Blubber', 'Blubber' ),
		'Text node (middle)'      => array( '<em>Bold move</em>', 2, 'yo', '<em>yo</em>' ),
		'Text node (end)'         => array( '<img>of a dog', 2, 'of a cat', '<img>of a cat' ),
		'Encoded text node'       => array( '<figcaption>birds and dogs</figcaption>', 2, '<birds> & <dogs>', '<figcaption>&lt;birds&gt; &amp; &lt;dogs&gt;</figcaption>' ),
		'SCRIPT tag'              => array( 'before<script></script>after', 2, 'const img = "<img> & <br>";', 'before<script>const img = "<img> & <br>";</script>after' ),
		'STYLE tag'               => array( '<style></style>', 1, 'p::before { content: "<img> & </style>"; }', '<style>p::before { content: "<img> & \3c\2fstyle>"; }</style>' ),
		'TEXTAREA tag'            => array( 'a<textarea>has no need to escape</textarea>b', 2, "so it <doesn't>", "a<textarea>so it <doesn't></textarea>b" ),
		'TEXTAREA (escape)'       => array( 'a<textarea>has no need to escape</textarea>b', 2, 'but it does for </textarea>', 'a<textarea>but it does for &lt;/textarea></textarea>b' ),
		'TEXTAREA (escape+attrs)' => array( 'a<textarea>has no need to escape</textarea>b', 2, 'but it does for </textarea not an="attribute">', 'a<textarea>but it does for &lt;/textarea not an="attribute"></textarea>b' ),
		'TITLE tag'               => array( 'a<title>has no need to escape</title>b', 2, "so it <doesn't>", "a<title>so it <doesn't></title>b" ),
		'TITLE (escape)'          => array( 'a<title>has no need to escape</title>b', 2, 'but it does for </title>', 'a<title>but it does for &lt;/title></title>b' ),
		'TITLE (escape+attrs)'    => array( 'a<title>has no need to escape</title>b', 2, 'but it does for </title not an="attribute">', 'a<title>but it does for &lt;/title not an="attribute"></title>b' ),
	);
}

/**
 * Ensures that updates with potentially-compromising values aren't accepted.
 *
 * For example, a modifiable text update should not be allowed which would
 * break the structure of the containing element, such as in a script or
 * comment.
 *
 * @ticket 61617
 *
 * @dataProvider data_unallowed_modifiable_text_updates
 *
 * @param string $html_with_nonempty_modifiable_text Will be used to find the test element.
 * @param string $invalid_update                     Update containing possibly-compromising text.
 */
public function test_rejects_updates_with_unallowed_substrings( string $html_with_nonempty_modifiable_text, string $invalid_update ) {
	$processor = new WP_HTML_Tag_Processor( $html_with_nonempty_modifiable_text );

	// Stop at the first token which reports non-empty modifiable text.
	while ( '' === $processor->get_modifiable_text() && $processor->next_token() ) {
		continue;
	}

	$original_text = $processor->get_modifiable_text();
	$this->assertNotEmpty( $original_text, 'Should have found non-empty text: check test setup.' );

	$this->assertFalse(
		$processor->set_modifiable_text( $invalid_update ),
		'Should have rejected possibly-compromising modifiable text update.'
	);

	// Flush updates.
	$processor->get_updated_html();

	$this->assertSame(
		$original_text,
		$processor->get_modifiable_text(),
		'Should have preserved the original modifiable text before the rejected update.'
	);
}

/**
 * Data provider.
 *
 * Each dataset is an HTML input containing a token with non-empty
 * modifiable text, followed by a replacement which must be rejected.
 *
 * @return array[]
 */
public static function data_unallowed_modifiable_text_updates() {
	return array(
		'Comment with -->'                 => array( '<!-- this is a comment -->', 'Comments end in -->' ),
		'Comment with --!>'                => array( '<!-- this is a comment -->', 'Invalid but legitimate comments end in --!>' ),
		'SCRIPT with </script>'            => array( '<script>Replace me</script>', 'Just a </script>' ),
		'SCRIPT with </script attributes>' => array( '<script>Replace me</script>', 'before</script id=sneak>after' ),
	);
}
Note: See TracChangeset
for help on using the changeset viewer.