Make WordPress Core

Changeset 58970


Ignore:
Timestamp:
09/02/2024 11:19:08 PM (6 weeks ago)
Author:
dmsnell
Message:

HTML API: Allow subdividing text nodes by meaningful prefixes.

HTML parsing rules at times differentiate character tokens that are all null bytes, all whitespace, or other content. This patch introduces a new function which may be used to classify text node sub-regions and lead to more efficient application of these parsing rules.

Further, when classified in this way, application code may skip some rules and decoding entirely, improving performance. For example, this can be used to ease the implementation of skipping inter-element whitespace, which is usually not rendered.

Developed in https://github.com/WordPress/wordpress-develop/pull/7236
Discussed in https://core.trac.wordpress.org/ticket/61974

Props dmsnell, jonsurrell.
Fixes #61974.

Location:
trunk
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r58967 r58970  
    844844        if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
    845845            parent::next_token();
     846            if (
     847                WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ||
     848                WP_HTML_Tag_Processor::STATE_CDATA_NODE === $this->parser_state
     849            ) {
     850                parent::subdivide_text_appropriately();
     851            }
    846852        }
    847853
     
    10571063             */
    10581064            case '#text':
    1059                 $text = $this->get_modifiable_text();
    1060                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     1065                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    10611066                    return $this->step();
    10621067                }
     
    11461151             */
    11471152            case '#text':
    1148                 $text = $this->get_modifiable_text();
    1149                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     1153                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    11501154                    return $this->step();
    11511155                }
     
    12281232             */
    12291233            case '#text':
    1230                 $text = $this->get_modifiable_text();
    1231                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     1234                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    12321235                    return $this->step();
    12331236                }
     
    13241327                 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
    13251328                 */
    1326                 $text = $this->get_modifiable_text();
    1327                 if ( '' === $text ) {
    1328                     /*
    1329                      * If the text is empty after processing HTML entities and stripping
    1330                      * U+0000 NULL bytes then ignore the token.
    1331                      */
    1332                     return $this->step();
    1333                 }
    1334 
    1335                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     1329                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    13361330                    // Insert the character.
    13371331                    $this->insert_html_element( $this->state->current_token );
     
    15531547             */
    15541548            case '#text':
    1555                 $text = $this->get_modifiable_text();
    1556                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     1549                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    15571550                    return $this->step_in_head();
    15581551                }
     
    16551648             */
    16561649            case '#text':
    1657                 $text = $this->get_modifiable_text();
    1658                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     1650                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    16591651                    // Insert the character.
    16601652                    $this->insert_html_element( $this->state->current_token );
     
    17941786        switch ( $op ) {
    17951787            case '#text':
    1796                 $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
    1797 
    17981788                /*
    17991789                 * > A character token that is U+0000 NULL
     
    18051795                 * the active formats should be reconstructed.
    18061796                 */
    1807                 if (
    1808                     1 <= $current_token->length &&
    1809                     "\x00" === $this->html[ $current_token->start ] &&
    1810                     strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
    1811                 ) {
     1797                if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
    18121798                    // Parse error: ignore the token.
    18131799                    return $this->step();
     
    18211807                 * contain character references which decode only to whitespace.
    18221808                 */
    1823                 $text = $this->get_modifiable_text();
    1824                 if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
     1809                if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) {
    18251810                    $this->state->frameset_ok = false;
    18261811                }
     
    28302815                    )
    28312816                ) {
    2832                     $text = $this->get_modifiable_text();
    28332817                    /*
    28342818                     * If the text is empty after processing HTML entities and stripping
    28352819                     * U+0000 NULL bytes then ignore the token.
    28362820                     */
    2837                     if ( '' === $text ) {
     2821                    if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
    28382822                        return $this->step();
    28392823                    }
     
    28582842                     * @see https://html.spec.whatwg.org/#parsing-main-intabletext
    28592843                     */
    2860                     if ( strlen( $text ) === strspn( $text, " \t\f\r\n" ) ) {
     2844                    if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    28612845                        $this->insert_html_element( $this->state->current_token );
    28622846                        return true;
     
    31783162             */
    31793163            case '#text':
    3180                 $text = $this->get_modifiable_text();
    3181                 if ( '' === $text ) {
    3182                     /*
    3183                      * If the text is empty after processing HTML entities and stripping
    3184                      * U+0000 NULL bytes then ignore the token.
    3185                      */
    3186                     return $this->step();
    3187                 }
    3188 
    3189                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     3164                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    31903165                    // Insert the character.
    31913166                    $this->insert_html_element( $this->state->current_token );
     
    36103585             */
    36113586            case '#text':
    3612                 $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
    3613 
    36143587                /*
    36153588                 * > A character token that is U+0000 NULL
     
    36183591                 * entirely ignored and should not return to calling code.
    36193592                 */
    3620                 if (
    3621                     1 <= $current_token->length &&
    3622                     "\x00" === $this->html[ $current_token->start ] &&
    3623                     strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
    3624                 ) {
     3593                if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
    36253594                    // Parse error: ignore the token.
    36263595                    return $this->step();
     
    39873956             */
    39883957            case '#text':
    3989                 $text = $this->get_modifiable_text();
    3990                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     3958                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    39913959                    return $this->step_in_body();
    39923960                }
     
    40734041             */
    40744042            case '#text':
    4075                 $text = $this->get_modifiable_text();
    4076                 $text = $this->get_modifiable_text();
    4077                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     4043                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    40784044                    return $this->step_in_body();
    40794045                }
     
    41944160             */
    41954161            case '#text':
    4196                 $text = $this->get_modifiable_text();
    4197                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     4162                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    41984163                    return $this->step_in_body();
    41994164                }
     
    42894254             */
    42904255            case '#text':
    4291                 $text = $this->get_modifiable_text();
    4292                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     4256                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    42934257                    return $this->step_in_body();
    42944258                }
     
    43564320             */
    43574321            case '#text':
    4358                 $text = $this->get_modifiable_text();
    4359                 if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     4322                if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
    43604323                    return $this->step_in_body();
    43614324                }
     
    44134376
    44144377        switch ( $op ) {
     4378            case '#cdata-section':
    44154379            case '#text':
    44164380                /*
     
    44254389                 * contain character references which decode only to whitespace.
    44264390                 */
    4427                 $text = $this->get_modifiable_text();
    4428                 if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
     4391                if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) {
    44294392                    $this->state->frameset_ok = false;
    44304393                }
     
    44364399             * > A comment token
    44374400             */
    4438             case '#cdata-section':
    44394401            case '#comment':
    44404402            case '#funky-comment':
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r58969 r58970  
    541541     */
    542542    protected $comment_type = null;
     543
     544    /**
     545     * What kind of text the matched text node represents, if it was subdivided.
     546     *
     547     * @see self::TEXT_IS_NULL_SEQUENCE
     548     * @see self::TEXT_IS_WHITESPACE
     549     * @see self::TEXT_IS_GENERIC
     550     * @see self::subdivide_text_appropriately
     551     *
     552     * @since 6.7.0
     553     *
     554     * @var string
     555     */
     556    protected $text_node_classification = self::TEXT_IS_GENERIC;
    543557
    544558    /**
     
    22002214        }
    22012215
    2202         $this->token_starts_at      = null;
    2203         $this->token_length         = null;
    2204         $this->tag_name_starts_at   = null;
    2205         $this->tag_name_length      = null;
    2206         $this->text_starts_at       = 0;
    2207         $this->text_length          = 0;
    2208         $this->is_closing_tag       = null;
    2209         $this->attributes           = array();
    2210         $this->comment_type         = null;
    2211         $this->duplicate_attributes = null;
     2216        $this->token_starts_at          = null;
     2217        $this->token_length             = null;
     2218        $this->tag_name_starts_at       = null;
     2219        $this->tag_name_length          = null;
     2220        $this->text_starts_at           = 0;
     2221        $this->text_length              = 0;
     2222        $this->is_closing_tag           = null;
     2223        $this->attributes               = array();
     2224        $this->comment_type             = null;
     2225        $this->text_node_classification = self::TEXT_IS_GENERIC;
     2226        $this->duplicate_attributes     = null;
    22122227    }
    22132228
     
    33203335
    33213336        return $this->comment_type;
     3337    }
     3338
     3339    /**
     3340     * Subdivides a matched text node or CDATA text node, splitting NULL byte sequences
     3341     * and decoded whitespace as distinct prefixes.
     3342     *
     3343     * Note that once anything that's neither a NULL byte nor decoded whitespace is
     3344     * encountered, then the remainder of the text node is left intact as generic text.
     3345     *
     3346     *  - The HTML Processor uses this to apply distinct rules for different kinds of text.
     3347     *  - Inter-element whitespace can be detected and skipped with this method.
     3348     *
     3349     * Text nodes aren't eagerly subdivided because there's no need to split them unless
     3350     * decisions are being made on NULL byte sequences or whitespace-only text.
     3351     *
     3352     * Example:
     3353     *
     3354     *     $processor = new WP_HTML_Tag_Processor( "\x00Apples & Oranges" );
     3355     *     true  === $processor->next_token();                   // Text is "Apples & Oranges".
     3356     *     true  === $processor->subdivide_text_appropriately(); // Text is "".
     3357     *     true  === $processor->next_token();                   // Text is "Apples & Oranges".
     3358     *     false === $processor->subdivide_text_appropriately();
     3359     *
     3360     *     $processor = new WP_HTML_Tag_Processor( "&#x13; \r\n\tMore" );
     3361     *     true  === $processor->next_token();                   // Text is "␤ ␤␉More".
     3362     *     true  === $processor->subdivide_text_appropriately(); // Text is "␤ ␤␉".
     3363     *     true  === $processor->next_token();                   // Text is "More".
     3364     *     false === $processor->subdivide_text_appropriately();
     3365     *
     3366     * @since 6.7.0
     3367     *
     3368     * @return bool Whether the text node was subdivided.
     3369     */
     3370    public function subdivide_text_appropriately(): bool {
     3371        $this->text_node_classification = self::TEXT_IS_GENERIC;
     3372
     3373        if ( self::STATE_TEXT_NODE === $this->parser_state ) {
     3374            /*
     3375             * NULL bytes are treated categorically differently than numeric character
     3376             * references whose number is zero. `&#x00;` is not the same as `"\x00"`.
     3377             */
     3378            $leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
     3379            if ( $leading_nulls > 0 ) {
     3380                $this->token_length             = $leading_nulls;
     3381                $this->text_length              = $leading_nulls;
     3382                $this->bytes_already_parsed     = $this->token_starts_at + $leading_nulls;
     3383                $this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
     3384                return true;
     3385            }
     3386
     3387            /*
     3388             * Start a decoding loop to determine the point at which the
     3389             * text subdivides. This entails raw whitespace bytes and any
     3390             * character reference that decodes to the same.
     3391             */
     3392            $at  = $this->text_starts_at;
     3393            $end = $this->text_starts_at + $this->text_length;
     3394            while ( $at < $end ) {
     3395                $skipped = strspn( $this->html, " \t\f\r\n", $at, $end - $at );
     3396                $at     += $skipped;
     3397
     3398                if ( $at < $end && '&' === $this->html[ $at ] ) {
     3399                    $matched_byte_length = null;
     3400                    $replacement         = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $at, $matched_byte_length );
     3401                    if ( isset( $replacement ) && 1 === strspn( $replacement, " \t\f\r\n" ) ) {
     3402                        $at += $matched_byte_length;
     3403                        continue;
     3404                    }
     3405                }
     3406
     3407                break;
     3408            }
     3409
     3410            if ( $at > $this->text_starts_at ) {
     3411                $new_length                     = $at - $this->text_starts_at;
     3412                $this->text_length              = $new_length;
     3413                $this->token_length             = $new_length;
     3414                $this->bytes_already_parsed     = $at;
     3415                $this->text_node_classification = self::TEXT_IS_WHITESPACE;
     3416                return true;
     3417            }
     3418
     3419            return false;
     3420        }
     3421
     3422        // Unlike text nodes, there are no character references within CDATA sections.
     3423        if ( self::STATE_CDATA_NODE === $this->parser_state ) {
     3424            $leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
     3425            if ( $leading_nulls === $this->text_length ) {
     3426                $this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
     3427                return true;
     3428            }
     3429
     3430            $leading_ws = strspn( $this->html, " \t\f\r\n", $this->text_starts_at, $this->text_length );
     3431            if ( $leading_ws === $this->text_length ) {
     3432                $this->text_node_classification = self::TEXT_IS_WHITESPACE;
     3433                return true;
     3434            }
     3435        }
     3436
     3437        return false;
    33223438    }
    33233439
     
    42494365     */
    42504366    const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
     4367
     4368    /**
     4369     * Indicates that a span of text may contain any combination of significant
     4370     * kinds of characters: NULL bytes, whitespace, and others.
     4371     *
     4372     * @see self::$text_node_classification
     4373     * @see self::subdivide_text_appropriately
     4374     *
     4375     * @since 6.7.0
     4376     */
     4377    const TEXT_IS_GENERIC = 'TEXT_IS_GENERIC';
     4378
     4379    /**
     4380     * Indicates that a span of text comprises a sequence only of NULL bytes.
     4381     *
     4382     * @see self::$text_node_classification
     4383     * @see self::subdivide_text_appropriately
     4384     *
     4385     * @since 6.7.0
     4386     */
     4387    const TEXT_IS_NULL_SEQUENCE = 'TEXT_IS_NULL_SEQUENCE';
     4388
     4389    /**
     4390     * Indicates that a span of decoded text comprises only whitespace.
     4391     *
     4392     * @see self::$text_node_classification
     4393     * @see self::subdivide_text_appropriately
     4394     *
     4395     * @since 6.7.0
     4396     */
     4397    const TEXT_IS_WHITESPACE = 'TEXT_IS_WHITESPACE';
    42514398}
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php

    r58926 r58970  
    2828        'comments01/line0155'    => 'Unimplemented: Need to access raw comment text on non-normative comments.',
    2929        'comments01/line0169'    => 'Unimplemented: Need to access raw comment text on non-normative comments.',
    30         'doctype01/line0380'     => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',
    3130        'html5test-com/line0129' => 'Unimplemented: Need to access raw comment text on non-normative comments.',
    3231        'noscript01/line0014'    => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    33         'tests1/line0692'        => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',
    3432        'tests14/line0022'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    3533        'tests14/line0055'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    3634        'tests19/line0488'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    3735        'tests19/line0500'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    38         'tests19/line0965'       => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',
    3936        'tests19/line1079'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    4037        'tests2/line0207'        => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
     
    4239        'tests2/line0697'        => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    4340        'tests2/line0709'        => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    44         'tests5/line0013'        => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',
    45         'tests5/line0077'        => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',
    46         'tests5/line0091'        => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',
    4741        'webkit01/line0231'      => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    4842    );
Note: See TracChangeset for help on using the changeset viewer.