- Timestamp:
- 09/02/2024 11:19:08 PM (17 months ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php
r58969 r58970 541 541 */ 542 542 protected $comment_type = null; 543 544 /** 545 * What kind of text the matched text node represents, if it was subdivided. 546 * 547 * @see self::TEXT_IS_NULL_SEQUENCE 548 * @see self::TEXT_IS_WHITESPACE 549 * @see self::TEXT_IS_GENERIC 550 * @see self::subdivide_text_appropriately 551 * 552 * @since 6.7.0 553 * 554 * @var string 555 */ 556 protected $text_node_classification = self::TEXT_IS_GENERIC; 543 557 544 558 /** … … 2200 2214 } 2201 2215 2202 $this->token_starts_at = null; 2203 $this->token_length = null; 2204 $this->tag_name_starts_at = null; 2205 $this->tag_name_length = null; 2206 $this->text_starts_at = 0; 2207 $this->text_length = 0; 2208 $this->is_closing_tag = null; 2209 $this->attributes = array(); 2210 $this->comment_type = null; 2211 $this->duplicate_attributes = null; 2216 $this->token_starts_at = null; 2217 $this->token_length = null; 2218 $this->tag_name_starts_at = null; 2219 $this->tag_name_length = null; 2220 $this->text_starts_at = 0; 2221 $this->text_length = 0; 2222 $this->is_closing_tag = null; 2223 $this->attributes = array(); 2224 $this->comment_type = null; 2225 $this->text_node_classification = self::TEXT_IS_GENERIC; 2226 $this->duplicate_attributes = null; 2212 2227 } 2213 2228 … … 3320 3335 3321 3336 return $this->comment_type; 3337 } 3338 3339 /** 3340 * Subdivides a matched text node or CDATA text node, splitting NULL byte sequences 3341 * and decoded whitespace as distinct prefixes. 3342 * 3343 * Note that once anything that's neither a NULL byte nor decoded whitespace is 3344 * encountered, then the remainder of the text node is left intact as generic text. 3345 * 3346 * - The HTML Processor uses this to apply distinct rules for different kinds of text. 3347 * - Inter-element whitespace can be detected and skipped with this method. 
3348 * 3349 * Text nodes aren't eagerly subdivided because there's no need to split them unless 3350 * decisions are being made on NULL byte sequences or whitespace-only text. 3351 * 3352 * Example: 3353 * 3354 * $processor = new WP_HTML_Tag_Processor( "\x00Apples & Oranges" ); 3355 * true === $processor->next_token(); // Text is "Apples & Oranges". 3356 * true === $processor->subdivide_text_appropriately(); // Text is "". 3357 * true === $processor->next_token(); // Text is "Apples & Oranges". 3358 * false === $processor->subdivide_text_appropriately(); 3359 * 3360 * $processor = new WP_HTML_Tag_Processor( " \r\n\tMore" ); 3361 * true === $processor->next_token(); // Text is " ␉More". 3362 * true === $processor->subdivide_text_appropriately(); // Text is " ␉". 3363 * true === $processor->next_token(); // Text is "More". 3364 * false === $processor->subdivide_text_appropriately(); 3365 * 3366 * @since 6.7.0 3367 * 3368 * @return bool Whether the text node was subdivided. 3369 */ 3370 public function subdivide_text_appropriately(): bool { 3371 $this->text_node_classification = self::TEXT_IS_GENERIC; 3372 3373 if ( self::STATE_TEXT_NODE === $this->parser_state ) { 3374 /* 3375 * NULL bytes are treated categorically different than numeric character 3376 * references whose number is zero. `&#x00;` is not the same as `"\x00"`. 3377 */ 3378 $leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length ); 3379 if ( $leading_nulls > 0 ) { 3380 $this->token_length = $leading_nulls; 3381 $this->text_length = $leading_nulls; 3382 $this->bytes_already_parsed = $this->token_starts_at + $leading_nulls; 3383 $this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE; 3384 return true; 3385 } 3386 3387 /* 3388 * Start a decoding loop to determine the point at which the 3389 * text subdivides. This entails raw whitespace bytes and any 3390 * character reference that decodes to the same. 
3391 */ 3392 $at = $this->text_starts_at; 3393 $end = $this->text_starts_at + $this->text_length; 3394 while ( $at < $end ) { 3395 $skipped = strspn( $this->html, " \t\f\r\n", $at, $end - $at ); 3396 $at += $skipped; 3397 3398 if ( $at < $end && '&' === $this->html[ $at ] ) { 3399 $matched_byte_length = null; 3400 $replacement = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $at, $matched_byte_length ); 3401 if ( isset( $replacement ) && 1 === strspn( $replacement, " \t\f\r\n" ) ) { 3402 $at += $matched_byte_length; 3403 continue; 3404 } 3405 } 3406 3407 break; 3408 } 3409 3410 if ( $at > $this->text_starts_at ) { 3411 $new_length = $at - $this->text_starts_at; 3412 $this->text_length = $new_length; 3413 $this->token_length = $new_length; 3414 $this->bytes_already_parsed = $at; 3415 $this->text_node_classification = self::TEXT_IS_WHITESPACE; 3416 return true; 3417 } 3418 3419 return false; 3420 } 3421 3422 // Unlike text nodes, there are no character references within CDATA sections. 3423 if ( self::STATE_CDATA_NODE === $this->parser_state ) { 3424 $leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length ); 3425 if ( $leading_nulls === $this->text_length ) { 3426 $this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE; 3427 return true; 3428 } 3429 3430 $leading_ws = strspn( $this->html, " \t\f\r\n", $this->text_starts_at, $this->text_length ); 3431 if ( $leading_ws === $this->text_length ) { 3432 $this->text_node_classification = self::TEXT_IS_WHITESPACE; 3433 return true; 3434 } 3435 } 3436 3437 return false; 3322 3438 } 3323 3439 … … 4249 4365 */ 4250 4366 const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML'; 4367 4368 /** 4369 * Indicates that a span of text may contain any combination of significant 4370 * kinds of characters: NULL bytes, whitespace, and others. 
4371 * 4372 * @see self::$text_node_classification 4373 * @see self::subdivide_text_appropriately 4374 * 4375 * @since 6.7.0 4376 */ 4377 const TEXT_IS_GENERIC = 'TEXT_IS_GENERIC'; 4378 4379 /** 4380 * Indicates that a span of text comprises a sequence only of NULL bytes. 4381 * 4382 * @see self::$text_node_classification 4383 * @see self::subdivide_text_appropriately 4384 * 4385 * @since 6.7.0 4386 */ 4387 const TEXT_IS_NULL_SEQUENCE = 'TEXT_IS_NULL_SEQUENCE'; 4388 4389 /** 4390 * Indicates that a span of decoded text comprises only whitespace. 4391 * 4392 * @see self::$text_node_classification 4393 * @see self::subdivide_text_appropriately 4394 * 4395 * @since 6.7.0 4396 */ 4397 const TEXT_IS_WHITESPACE = 'TEXT_IS_WHITESPACE'; 4251 4398 }
Note: See TracChangeset
for help on using the changeset viewer.