Make WordPress Core

Changeset 58613


Ignore:
Timestamp:
07/01/2024 11:34:19 PM (2 days ago)
Author:
dmsnell
Message:

HTML API: Optimize low-level parsing details in Tag Processor.

Introduces a number of micro-level optimizations in the Tag Processor to
improve token-scanning performance. Should contain no functional changes.

Based on benchmarking against a list of the 100 most-visited websites,
these changes improve the Tag Processor's tag-scanning performance by an
average of between 3.5% and 7.5%.

Developed in https://github.com/WordPress/wordpress-develop/pull/6890
Discussed in https://core.trac.wordpress.org/ticket/61545

Follow-up to [55203].

See #61545.

Location:
trunk/src/wp-includes/html-api
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-decoder.php

    r58281 r58613  
    142142        while ( $at < $end ) {
    143143            $next_character_reference_at = strpos( $text, '&', $at );
    144             if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) {
     144            if ( false === $next_character_reference_at ) {
    145145                break;
    146146            }
     
    437437
    438438        if ( $code_point <= 0x7FF ) {
    439             $byte1 = ( $code_point >> 6 ) | 0xC0;
    440             $byte2 = $code_point & 0x3F | 0x80;
    441 
    442             return pack( 'CC', $byte1, $byte2 );
     439            $byte1 = chr( ( $code_point >> 6 ) | 0xC0 );
     440            $byte2 = chr( $code_point & 0x3F | 0x80 );
     441
     442            return "{$byte1}{$byte2}";
    443443        }
    444444
    445445        if ( $code_point <= 0xFFFF ) {
    446             $byte1 = ( $code_point >> 12 ) | 0xE0;
    447             $byte2 = ( $code_point >> 6 ) & 0x3F | 0x80;
    448             $byte3 = $code_point & 0x3F | 0x80;
    449 
    450             return pack( 'CCC', $byte1, $byte2, $byte3 );
     446            $byte1 = chr( ( $code_point >> 12 ) | 0xE0 );
     447            $byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
     448            $byte3 = chr( $code_point & 0x3F | 0x80 );
     449
     450            return "{$byte1}{$byte2}{$byte3}";
    451451        }
    452452
    453453        // Any values above U+10FFFF are eliminated above in the pre-check.
    454         $byte1 = ( $code_point >> 18 ) | 0xF0;
    455         $byte2 = ( $code_point >> 12 ) & 0x3F | 0x80;
    456         $byte3 = ( $code_point >> 6 ) & 0x3F | 0x80;
    457         $byte4 = $code_point & 0x3F | 0x80;
    458 
    459         return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 );
     454        $byte1 = chr( ( $code_point >> 18 ) | 0xF0 );
     455        $byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 );
     456        $byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
     457        $byte4 = chr( $code_point & 0x3F | 0x80 );
     458
     459        return "{$byte1}{$byte2}{$byte3}{$byte4}";
    460460    }
    461461}
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r58559 r58613  
    15251525        $at         = $was_at;
    15261526
    1527         while ( false !== $at && $at < $doc_length ) {
     1527        while ( $at < $doc_length ) {
    15281528            $at = strpos( $html, '<', $at );
    1529 
    1530             /*
    1531              * This does not imply an incomplete parse; it indicates that there
    1532              * can be nothing left in the document other than a #text node.
    1533              */
    15341529            if ( false === $at ) {
    1535                 $this->parser_state         = self::STATE_TEXT_NODE;
    1536                 $this->token_starts_at      = $was_at;
    1537                 $this->token_length         = strlen( $html ) - $was_at;
    1538                 $this->text_starts_at       = $was_at;
    1539                 $this->text_length          = $this->token_length;
    1540                 $this->bytes_already_parsed = strlen( $html );
    1541                 return true;
     1530                break;
    15421531            }
    15431532
     
    15551544                 * @see https://html.spec.whatwg.org/#tag-open-state
    15561545                 */
    1557                 if ( strlen( $html ) > $at + 1 ) {
    1558                     $next_character  = $html[ $at + 1 ];
    1559                     $at_another_node = (
    1560                         '!' === $next_character ||
    1561                         '/' === $next_character ||
    1562                         '?' === $next_character ||
    1563                         ( 'A' <= $next_character && $next_character <= 'Z' ) ||
    1564                         ( 'a' <= $next_character && $next_character <= 'z' )
    1565                     );
    1566                     if ( ! $at_another_node ) {
    1567                         ++$at;
    1568                         continue;
    1569                     }
     1546                if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) {
     1547                    ++$at;
     1548                    continue;
    15701549                }
    15711550
     
    16311610                 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
    16321611                 */
    1633                 if (
    1634                     $doc_length > $at + 3 &&
    1635                     '-' === $html[ $at + 2 ] &&
    1636                     '-' === $html[ $at + 3 ]
    1637                 ) {
     1612                if ( 0 === substr_compare( $html, '--', $at + 2, 2 ) ) {
    16381613                    $closer_at = $at + 4;
    16391614                    // If it's not possible to close the comment then there is nothing more to scan.
     
    19121887        }
    19131888
    1914         return false;
     1889        /*
     1890         * This does not imply an incomplete parse; it indicates that there
     1891         * can be nothing left in the document other than a #text node.
     1892         */
     1893        $this->parser_state         = self::STATE_TEXT_NODE;
     1894        $this->token_starts_at      = $was_at;
     1895        $this->token_length         = $doc_length - $was_at;
     1896        $this->text_starts_at       = $was_at;
     1897        $this->text_length          = $this->token_length;
     1898        $this->bytes_already_parsed = $doc_length;
     1899        return true;
    19151900    }
    19161901
     
    19231908     */
    19241909    private function parse_next_attribute() {
     1910        $doc_length = strlen( $this->html );
     1911
    19251912        // Skip whitespace and slashes.
    19261913        $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
    1927         if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
     1914        if ( $this->bytes_already_parsed >= $doc_length ) {
    19281915            $this->parser_state = self::STATE_INCOMPLETE_INPUT;
    19291916
     
    19421929
    19431930        // No attribute, just tag closer.
    1944         if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) {
     1931        if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= $doc_length ) {
    19451932            return false;
    19461933        }
     
    19491936        $attribute_name              = substr( $this->html, $attribute_start, $name_length );
    19501937        $this->bytes_already_parsed += $name_length;
    1951         if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
     1938        if ( $this->bytes_already_parsed >= $doc_length ) {
    19521939            $this->parser_state = self::STATE_INCOMPLETE_INPUT;
    19531940
     
    19561943
    19571944        $this->skip_whitespace();
    1958         if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
     1945        if ( $this->bytes_already_parsed >= $doc_length ) {
    19591946            $this->parser_state = self::STATE_INCOMPLETE_INPUT;
    19601947
     
    19661953            ++$this->bytes_already_parsed;
    19671954            $this->skip_whitespace();
    1968             if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
     1955            if ( $this->bytes_already_parsed >= $doc_length ) {
    19691956                $this->parser_state = self::STATE_INCOMPLETE_INPUT;
    19701957
     
    19771964                    $quote                      = $this->html[ $this->bytes_already_parsed ];
    19781965                    $value_start                = $this->bytes_already_parsed + 1;
    1979                     $value_length               = strcspn( $this->html, $quote, $value_start );
    1980                     $attribute_end              = $value_start + $value_length + 1;
     1966                    $end_quote_at               = strpos( $this->html, $quote, $value_start );
     1967                    $end_quote_at               = false === $end_quote_at ? $doc_length : $end_quote_at;
     1968                    $value_length               = $end_quote_at - $value_start;
     1969                    $attribute_end              = $end_quote_at + 1;
    19811970                    $this->bytes_already_parsed = $attribute_end;
    19821971                    break;
     
    19941983        }
    19951984
    1996         if ( $attribute_end >= strlen( $this->html ) ) {
     1985        if ( $attribute_end >= $doc_length ) {
    19971986            $this->parser_state = self::STATE_INCOMPLETE_INPUT;
    19981987
     
    20152004
    20162005        // If an attribute is listed many times, only use the first declaration and ignore the rest.
    2017         if ( ! array_key_exists( $comparable_name, $this->attributes ) ) {
     2006        if ( ! isset( $this->attributes[ $comparable_name ] ) ) {
    20182007            $this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token(
    20192008                $attribute_name,
     
    20392028        if ( null === $this->duplicate_attributes ) {
    20402029            $this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
    2041         } elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
     2030        } elseif ( ! isset( $this->duplicate_attributes[ $comparable_name ] ) ) {
    20422031            $this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span );
    20432032        } else {
     
    31113100
    31123101        // Removes any duplicated attributes if they were also present.
    3113         if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) {
    3114             foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
    3115                 $this->lexical_updates[] = new WP_HTML_Text_Replacement(
    3116                     $attribute_token->start,
    3117                     $attribute_token->length,
    3118                     ''
    3119                 );
    3120             }
     3102        foreach ( $this->duplicate_attributes[ $name ] ?? array() as $attribute_token ) {
     3103            $this->lexical_updates[] = new WP_HTML_Text_Replacement(
     3104                $attribute_token->start,
     3105                $attribute_token->length,
     3106                ''
     3107            );
    31213108        }
    31223109
     
    33183305
    33193306        // Does the tag name match the requested tag name in a case-insensitive manner?
    3320         if ( null !== $this->sought_tag_name ) {
    3321             /*
    3322              * String (byte) length lookup is fast. If they aren't the
    3323              * same length then they can't be the same string values.
    3324              */
    3325             if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) {
    3326                 return false;
    3327             }
    3328 
    3329             /*
    3330              * Check each character to determine if they are the same.
    3331              * Defer calls to `strtoupper()` to avoid them when possible.
    3332              * Calling `strcasecmp()` here tested slowed than comparing each
    3333              * character, so unless benchmarks show otherwise, it should
    3334              * not be used.
    3335              *
    3336              * It's expected that most of the time that this runs, a
    3337              * lower-case tag name will be supplied and the input will
    3338              * contain lower-case tag names, thus normally bypassing
    3339              * the case comparison code.
    3340              */
    3341             for ( $i = 0; $i < $this->tag_name_length; $i++ ) {
    3342                 $html_char = $this->html[ $this->tag_name_starts_at + $i ];
    3343                 $tag_char  = $this->sought_tag_name[ $i ];
    3344 
    3345                 if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) {
    3346                     return false;
    3347                 }
    3348             }
     3307        if ( isset( $this->sought_tag_name ) && 0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) ) {
     3308            return false;
    33493309        }
    33503310
Note: See TracChangeset for help on using the changeset viewer.