Changeset 58970
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/html-api/class-wp-html-processor.php
r58967 r58970 844 844 if ( self::PROCESS_NEXT_NODE === $node_to_process ) { 845 845 parent::next_token(); 846 if ( 847 WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state || 848 WP_HTML_Tag_Processor::STATE_CDATA_NODE === $this->parser_state 849 ) { 850 parent::subdivide_text_appropriately(); 851 } 846 852 } 847 853 … … 1057 1063 */ 1058 1064 case '#text': 1059 $text = $this->get_modifiable_text(); 1060 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1065 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 1061 1066 return $this->step(); 1062 1067 } … … 1146 1151 */ 1147 1152 case '#text': 1148 $text = $this->get_modifiable_text(); 1149 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1153 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 1150 1154 return $this->step(); 1151 1155 } … … 1228 1232 */ 1229 1233 case '#text': 1230 $text = $this->get_modifiable_text(); 1231 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1234 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 1232 1235 return $this->step(); 1233 1236 } … … 1324 1327 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE 1325 1328 */ 1326 $text = $this->get_modifiable_text(); 1327 if ( '' === $text ) { 1328 /* 1329 * If the text is empty after processing HTML entities and stripping 1330 * U+0000 NULL bytes then ignore the token. 1331 */ 1332 return $this->step(); 1333 } 1334 1335 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1329 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 1336 1330 // Insert the character. 
1337 1331 $this->insert_html_element( $this->state->current_token ); … … 1553 1547 */ 1554 1548 case '#text': 1555 $text = $this->get_modifiable_text(); 1556 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1549 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 1557 1550 return $this->step_in_head(); 1558 1551 } … … 1655 1648 */ 1656 1649 case '#text': 1657 $text = $this->get_modifiable_text(); 1658 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 1650 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 1659 1651 // Insert the character. 1660 1652 $this->insert_html_element( $this->state->current_token ); … … 1794 1786 switch ( $op ) { 1795 1787 case '#text': 1796 $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];1797 1798 1788 /* 1799 1789 * > A character token that is U+0000 NULL … … 1805 1795 * the active formats should be reconstructed. 1806 1796 */ 1807 if ( 1808 1 <= $current_token->length && 1809 "\x00" === $this->html[ $current_token->start ] && 1810 strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length 1811 ) { 1797 if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) { 1812 1798 // Parse error: ignore the token. 1813 1799 return $this->step(); … … 1821 1807 * contain character references which decode only to whitespace. 1822 1808 */ 1823 $text = $this->get_modifiable_text(); 1824 if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) { 1809 if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) { 1825 1810 $this->state->frameset_ok = false; 1826 1811 } … … 2830 2815 ) 2831 2816 ) { 2832 $text = $this->get_modifiable_text();2833 2817 /* 2834 2818 * If the text is empty after processing HTML entities and stripping 2835 2819 * U+0000 NULL bytes then ignore the token. 
2836 2820 */ 2837 if ( '' === $text) {2821 if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) { 2838 2822 return $this->step(); 2839 2823 } … … 2858 2842 * @see https://html.spec.whatwg.org/#parsing-main-intabletext 2859 2843 */ 2860 if ( strlen( $text ) === strspn( $text, " \t\f\r\n" )) {2844 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 2861 2845 $this->insert_html_element( $this->state->current_token ); 2862 2846 return true; … … 3178 3162 */ 3179 3163 case '#text': 3180 $text = $this->get_modifiable_text(); 3181 if ( '' === $text ) { 3182 /* 3183 * If the text is empty after processing HTML entities and stripping 3184 * U+0000 NULL bytes then ignore the token. 3185 */ 3186 return $this->step(); 3187 } 3188 3189 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 3164 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 3190 3165 // Insert the character. 3191 3166 $this->insert_html_element( $this->state->current_token ); … … 3610 3585 */ 3611 3586 case '#text': 3612 $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];3613 3614 3587 /* 3615 3588 * > A character token that is U+0000 NULL … … 3618 3591 * entirely ignored and should not return to calling code. 3619 3592 */ 3620 if ( 3621 1 <= $current_token->length && 3622 "\x00" === $this->html[ $current_token->start ] && 3623 strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length 3624 ) { 3593 if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) { 3625 3594 // Parse error: ignore the token. 
3626 3595 return $this->step(); … … 3987 3956 */ 3988 3957 case '#text': 3989 $text = $this->get_modifiable_text(); 3990 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 3958 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 3991 3959 return $this->step_in_body(); 3992 3960 } … … 4073 4041 */ 4074 4042 case '#text': 4075 $text = $this->get_modifiable_text(); 4076 $text = $this->get_modifiable_text(); 4077 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 4043 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 4078 4044 return $this->step_in_body(); 4079 4045 } … … 4194 4160 */ 4195 4161 case '#text': 4196 $text = $this->get_modifiable_text(); 4197 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 4162 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 4198 4163 return $this->step_in_body(); 4199 4164 } … … 4289 4254 */ 4290 4255 case '#text': 4291 $text = $this->get_modifiable_text(); 4292 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 4256 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 4293 4257 return $this->step_in_body(); 4294 4258 } … … 4356 4320 */ 4357 4321 case '#text': 4358 $text = $this->get_modifiable_text(); 4359 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { 4322 if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { 4360 4323 return $this->step_in_body(); 4361 4324 } … … 4413 4376 4414 4377 switch ( $op ) { 4378 case '#cdata-section': 4415 4379 case '#text': 4416 4380 /* … … 4425 4389 * contain character references which decode only to whitespace. 
4426 4390 */ 4427 $text = $this->get_modifiable_text(); 4428 if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) { 4391 if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) { 4429 4392 $this->state->frameset_ok = false; 4430 4393 } … … 4436 4399 * > A comment token 4437 4400 */ 4438 case '#cdata-section':4439 4401 case '#comment': 4440 4402 case '#funky-comment': -
trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php
r58969 r58970 541 541 */ 542 542 protected $comment_type = null; 543 544 /** 545 * What kind of text the matched text node represents, if it was subdivided. 546 * 547 * @see self::TEXT_IS_NULL_SEQUENCE 548 * @see self::TEXT_IS_WHITESPACE 549 * @see self::TEXT_IS_GENERIC 550 * @see self::subdivide_text_appropriately 551 * 552 * @since 6.7.0 553 * 554 * @var string 555 */ 556 protected $text_node_classification = self::TEXT_IS_GENERIC; 543 557 544 558 /** … … 2200 2214 } 2201 2215 2202 $this->token_starts_at = null; 2203 $this->token_length = null; 2204 $this->tag_name_starts_at = null; 2205 $this->tag_name_length = null; 2206 $this->text_starts_at = 0; 2207 $this->text_length = 0; 2208 $this->is_closing_tag = null; 2209 $this->attributes = array(); 2210 $this->comment_type = null; 2211 $this->duplicate_attributes = null; 2216 $this->token_starts_at = null; 2217 $this->token_length = null; 2218 $this->tag_name_starts_at = null; 2219 $this->tag_name_length = null; 2220 $this->text_starts_at = 0; 2221 $this->text_length = 0; 2222 $this->is_closing_tag = null; 2223 $this->attributes = array(); 2224 $this->comment_type = null; 2225 $this->text_node_classification = self::TEXT_IS_GENERIC; 2226 $this->duplicate_attributes = null; 2212 2227 } 2213 2228 … … 3320 3335 3321 3336 return $this->comment_type; 3337 } 3338 3339 /** 3340 * Subdivides a matched text node or CDATA text node, splitting NULL byte sequences 3341 * and decoded whitespace as distinct prefixes. 3342 * 3343 * Note that once anything that's neither a NULL byte nor decoded whitespace is 3344 * encountered, then the remainder of the text node is left intact as generic text. 3345 * 3346 * - The HTML Processor uses this to apply distinct rules for different kinds of text. 3347 * - Inter-element whitespace can be detected and skipped with this method. 
3348 * 3349 * Text nodes aren't eagerly subdivided because there's no need to split them unless 3350 * decisions are being made on NULL byte sequences or whitespace-only text. 3351 * 3352 * Example: 3353 * 3354 * $processor = new WP_HTML_Tag_Processor( "\x00Apples & Oranges" ); 3355 * true === $processor->next_token(); // Text is "Apples & Oranges". 3356 * true === $processor->subdivide_text_appropriately(); // Text is "". 3357 * true === $processor->next_token(); // Text is "Apples & Oranges". 3358 * false === $processor->subdivide_text_appropriately(); 3359 * 3360 * $processor = new WP_HTML_Tag_Processor( " \r\n\tMore" ); 3361 * true === $processor->next_token(); // Text is " ␉More". 3362 * true === $processor->subdivide_text_appropriately(); // Text is " ␉". 3363 * true === $processor->next_token(); // Text is "More". 3364 * false === $processor->subdivide_text_appropriately(); 3365 * 3366 * @since 6.7.0 3367 * 3368 * @return bool Whether the text node was subdivided. 3369 */ 3370 public function subdivide_text_appropriately(): bool { 3371 $this->text_node_classification = self::TEXT_IS_GENERIC; 3372 3373 if ( self::STATE_TEXT_NODE === $this->parser_state ) { 3374 /* 3375 * NULL bytes are treated categorically different than numeric character 3376 * references whose number is zero. `&#x00;` is not the same as `"\x00"`. 3377 */ 3378 $leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length ); 3379 if ( $leading_nulls > 0 ) { 3380 $this->token_length = $leading_nulls; 3381 $this->text_length = $leading_nulls; 3382 $this->bytes_already_parsed = $this->token_starts_at + $leading_nulls; 3383 $this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE; 3384 return true; 3385 } 3386 3387 /* 3388 * Start a decoding loop to determine the point at which the 3389 * text subdivides. This entails raw whitespace bytes and any 3390 * character reference that decodes to the same. 
3391 */ 3392 $at = $this->text_starts_at; 3393 $end = $this->text_starts_at + $this->text_length; 3394 while ( $at < $end ) { 3395 $skipped = strspn( $this->html, " \t\f\r\n", $at, $end - $at ); 3396 $at += $skipped; 3397 3398 if ( $at < $end && '&' === $this->html[ $at ] ) { 3399 $matched_byte_length = null; 3400 $replacement = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $at, $matched_byte_length ); 3401 if ( isset( $replacement ) && 1 === strspn( $replacement, " \t\f\r\n" ) ) { 3402 $at += $matched_byte_length; 3403 continue; 3404 } 3405 } 3406 3407 break; 3408 } 3409 3410 if ( $at > $this->text_starts_at ) { 3411 $new_length = $at - $this->text_starts_at; 3412 $this->text_length = $new_length; 3413 $this->token_length = $new_length; 3414 $this->bytes_already_parsed = $at; 3415 $this->text_node_classification = self::TEXT_IS_WHITESPACE; 3416 return true; 3417 } 3418 3419 return false; 3420 } 3421 3422 // Unlike text nodes, there are no character references within CDATA sections. 3423 if ( self::STATE_CDATA_NODE === $this->parser_state ) { 3424 $leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length ); 3425 if ( $leading_nulls === $this->text_length ) { 3426 $this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE; 3427 return true; 3428 } 3429 3430 $leading_ws = strspn( $this->html, " \t\f\r\n", $this->text_starts_at, $this->text_length ); 3431 if ( $leading_ws === $this->text_length ) { 3432 $this->text_node_classification = self::TEXT_IS_WHITESPACE; 3433 return true; 3434 } 3435 } 3436 3437 return false; 3322 3438 } 3323 3439 … … 4249 4365 */ 4250 4366 const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML'; 4367 4368 /** 4369 * Indicates that a span of text may contain any combination of significant 4370 * kinds of characters: NULL bytes, whitespace, and others. 
4371 * 4372 * @see self::$text_node_classification 4373 * @see self::subdivide_text_appropriately 4374 * 4375 * @since 6.7.0 4376 */ 4377 const TEXT_IS_GENERIC = 'TEXT_IS_GENERIC'; 4378 4379 /** 4380 * Indicates that a span of text comprises a sequence only of NULL bytes. 4381 * 4382 * @see self::$text_node_classification 4383 * @see self::subdivide_text_appropriately 4384 * 4385 * @since 6.7.0 4386 */ 4387 const TEXT_IS_NULL_SEQUENCE = 'TEXT_IS_NULL_SEQUENCE'; 4388 4389 /** 4390 * Indicates that a span of decoded text comprises only whitespace. 4391 * 4392 * @see self::$text_node_classification 4393 * @see self::subdivide_text_appropriately 4394 * 4395 * @since 6.7.0 4396 */ 4397 const TEXT_IS_WHITESPACE = 'TEXT_IS_WHITESPACE'; 4251 4398 } -
trunk/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php
r58926 r58970 28 28 'comments01/line0155' => 'Unimplemented: Need to access raw comment text on non-normative comments.', 29 29 'comments01/line0169' => 'Unimplemented: Need to access raw comment text on non-normative comments.', 30 'doctype01/line0380' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',31 30 'html5test-com/line0129' => 'Unimplemented: Need to access raw comment text on non-normative comments.', 32 31 'noscript01/line0014' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 33 'tests1/line0692' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',34 32 'tests14/line0022' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 35 33 'tests14/line0055' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 36 34 'tests19/line0488' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 37 35 'tests19/line0500' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 38 'tests19/line0965' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',39 36 'tests19/line1079' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 40 37 'tests2/line0207' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', … … 42 39 'tests2/line0697' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 43 40 'tests2/line0709' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 44 'tests5/line0013' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',45 'tests5/line0077' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',46 'tests5/line0091' => 'Bug: Mixed whitespace, non-whitespace text in 
head not split correctly',47 41 'webkit01/line0231' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 48 42 );
Note: See TracChangeset
for help on using the changeset viewer.