Make WordPress Core


Ignore:
Timestamp:
09/02/2024 11:19:08 PM (17 months ago)
Author:
dmsnell
Message:

HTML API: Allow subdividing text nodes by meaningful prefixes.

HTML parsing rules at times differentiate character tokens that are all null bytes, all whitespace, or other content. This patch introduces a new function which may be used to classify text node sub-regions and lead to more efficient application of these parsing rules.

Further, when classified in this way, application code may skip some rules and decoding entirely, improving performance. For example, this can be used to ease the implementation of skipping inter-element whitespace, which is usually not rendered.

Developed in https://github.com/WordPress/wordpress-develop/pull/7236
Discussed in https://core.trac.wordpress.org/ticket/61974

Props dmsnell, jonsurrell.
Fixes #61974.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r58969 r58970  
    541541     */
    542542    protected $comment_type = null;
     543
     544    /**
     545     * What kind of text the matched text node represents, if it was subdivided.
     546     *
     547     * @see self::TEXT_IS_NULL_SEQUENCE
     548     * @see self::TEXT_IS_WHITESPACE
     549     * @see self::TEXT_IS_GENERIC
     550     * @see self::subdivide_text_appropriately
     551     *
     552     * @since 6.7.0
     553     *
     554     * @var string
     555     */
     556    protected $text_node_classification = self::TEXT_IS_GENERIC;
    543557
    544558    /**
     
    22002214        }
    22012215
    2202         $this->token_starts_at      = null;
    2203         $this->token_length         = null;
    2204         $this->tag_name_starts_at   = null;
    2205         $this->tag_name_length      = null;
    2206         $this->text_starts_at       = 0;
    2207         $this->text_length          = 0;
    2208         $this->is_closing_tag       = null;
    2209         $this->attributes           = array();
    2210         $this->comment_type         = null;
    2211         $this->duplicate_attributes = null;
     2216        $this->token_starts_at          = null;
     2217        $this->token_length             = null;
     2218        $this->tag_name_starts_at       = null;
     2219        $this->tag_name_length          = null;
     2220        $this->text_starts_at           = 0;
     2221        $this->text_length              = 0;
     2222        $this->is_closing_tag           = null;
     2223        $this->attributes               = array();
     2224        $this->comment_type             = null;
     2225        $this->text_node_classification = self::TEXT_IS_GENERIC;
     2226        $this->duplicate_attributes     = null;
    22122227    }
    22132228
     
    33203335
    33213336        return $this->comment_type;
     3337    }
     3338
     3339    /**
     3340     * Subdivides a matched text node or CDATA text node, splitting NULL byte sequences
     3341     * and decoded whitespace as distinct prefixes.
     3342     *
     3343     * Note that once anything that's neither a NULL byte nor decoded whitespace is
     3344     * encountered, then the remainder of the text node is left intact as generic text.
     3345     *
     3346     *  - The HTML Processor uses this to apply distinct rules for different kinds of text.
     3347     *  - Inter-element whitespace can be detected and skipped with this method.
     3348     *
     3349     * Text nodes aren't eagerly subdivided because there's no need to split them unless
     3350     * decisions are being made on NULL byte sequences or whitespace-only text.
     3351     *
     3352     * Example:
     3353     *
     3354     *     $processor = new WP_HTML_Tag_Processor( "\x00Apples & Oranges" );
     3355     *     true  === $processor->next_token();                   // Text is "Apples & Oranges".
     3356     *     true  === $processor->subdivide_text_appropriately(); // Text is "".
     3357     *     true  === $processor->next_token();                   // Text is "Apples & Oranges".
     3358     *     false === $processor->subdivide_text_appropriately();
     3359     *
     3360     *     $processor = new WP_HTML_Tag_Processor( "&#x0A; \r\n\tMore" );
     3361     *     true  === $processor->next_token();                   // Text is "␤ ␤␉More".
     3362     *     true  === $processor->subdivide_text_appropriately(); // Text is "␤ ␤␉".
     3363     *     true  === $processor->next_token();                   // Text is "More".
     3364     *     false === $processor->subdivide_text_appropriately();
     3365     *
     3366     * @since 6.7.0
     3367     *
     3368     * @return bool Whether the text node was subdivided.
     3369     */
     3370    public function subdivide_text_appropriately(): bool {
     3371        $this->text_node_classification = self::TEXT_IS_GENERIC;
     3372
     3373        if ( self::STATE_TEXT_NODE === $this->parser_state ) {
     3374            /*
     3375             * NULL bytes are treated categorically differently than numeric character
     3376             * references whose number is zero. `&#x00;` is not the same as `"\x00"`.
     3377             */
     3378            $leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
     3379            if ( $leading_nulls > 0 ) {
     3380                $this->token_length             = $leading_nulls;
     3381                $this->text_length              = $leading_nulls;
     3382                $this->bytes_already_parsed     = $this->token_starts_at + $leading_nulls;
     3383                $this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
     3384                return true;
     3385            }
     3386
     3387            /*
     3388             * Start a decoding loop to determine the point at which the
     3389             * text subdivides. This entails raw whitespace bytes and any
     3390             * character reference that decodes to the same.
     3391             */
     3392            $at  = $this->text_starts_at;
     3393            $end = $this->text_starts_at + $this->text_length;
     3394            while ( $at < $end ) {
     3395                $skipped = strspn( $this->html, " \t\f\r\n", $at, $end - $at );
     3396                $at     += $skipped;
     3397
     3398                if ( $at < $end && '&' === $this->html[ $at ] ) {
     3399                    $matched_byte_length = null;
     3400                    $replacement         = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $at, $matched_byte_length );
     3401                    if ( isset( $replacement ) && 1 === strspn( $replacement, " \t\f\r\n" ) ) {
     3402                        $at += $matched_byte_length;
     3403                        continue;
     3404                    }
     3405                }
     3406
     3407                break;
     3408            }
     3409
     3410            if ( $at > $this->text_starts_at ) {
     3411                $new_length                     = $at - $this->text_starts_at;
     3412                $this->text_length              = $new_length;
     3413                $this->token_length             = $new_length;
     3414                $this->bytes_already_parsed     = $at;
     3415                $this->text_node_classification = self::TEXT_IS_WHITESPACE;
     3416                return true;
     3417            }
     3418
     3419            return false;
     3420        }
     3421
     3422        // Unlike text nodes, there are no character references within CDATA sections.
     3423        if ( self::STATE_CDATA_NODE === $this->parser_state ) {
     3424            $leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
     3425            if ( $leading_nulls === $this->text_length ) {
     3426                $this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
     3427                return true;
     3428            }
     3429
     3430            $leading_ws = strspn( $this->html, " \t\f\r\n", $this->text_starts_at, $this->text_length );
     3431            if ( $leading_ws === $this->text_length ) {
     3432                $this->text_node_classification = self::TEXT_IS_WHITESPACE;
     3433                return true;
     3434            }
     3435        }
     3436
     3437        return false;
    33223438    }
    33233439
     
    42494365     */
    42504366    const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
     4367
     4368    /**
     4369     * Indicates that a span of text may contain any combination of significant
     4370     * kinds of characters: NULL bytes, whitespace, and others.
     4371     *
     4372     * @see self::$text_node_classification
     4373     * @see self::subdivide_text_appropriately
     4374     *
     4375     * @since 6.7.0
     4376     */
     4377    const TEXT_IS_GENERIC = 'TEXT_IS_GENERIC';
     4378
     4379    /**
     4380     * Indicates that a span of text comprises a sequence only of NULL bytes.
     4381     *
     4382     * @see self::$text_node_classification
     4383     * @see self::subdivide_text_appropriately
     4384     *
     4385     * @since 6.7.0
     4386     */
     4387    const TEXT_IS_NULL_SEQUENCE = 'TEXT_IS_NULL_SEQUENCE';
     4388
     4389    /**
     4390     * Indicates that a span of decoded text comprises only whitespace.
     4391     *
     4392     * @see self::$text_node_classification
     4393     * @see self::subdivide_text_appropriately
     4394     *
     4395     * @since 6.7.0
     4396     */
     4397    const TEXT_IS_WHITESPACE = 'TEXT_IS_WHITESPACE';
    42514398}
Note: See TracChangeset for help on using the changeset viewer.