Context Navigation

← Previous Changeset
Next Changeset →

Changeset 58613

Timestamp:

07/01/2024 11:34:19 PM (2 days ago)

Author:

dmsnell

Message:

HTML API: Optimize low-level parsing details in Tag Processor.

Introduces a number of micro-level optimizations in the Tag Processor to
improve token-scanning performance. Should contain no functional changes.

Based on benchmarking against a list of the 100 most-visited websites,
these changes improve the Tag Processor's tag-scanning performance by an
average of between 3.5% and 7.5%.

Developed in https://github.com/WordPress/wordpress-develop/pull/6890
Discussed in https://core.trac.wordpress.org/ticket/61545

Follow-up to [55203].

See #61545.

Location:

trunk/src/wp-includes/html-api

Files:

: 2 edited

class-wp-html-decoder.php (modified) (2 diffs)
class-wp-html-tag-processor.php (modified) (15 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/src/wp-includes/html-api/class-wp-html-decoder.php

-                      r58281
+                      r58613
         while ( $at < $end ) {
             $next_character_reference_at = strpos( $text, '&', $at );
             if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) {
+            if ( false === $next_character_reference_at ) {
                 break;
+            }
 …
         if ( $code_point <= 0x7FF ) {
             $byte1 = ( $code_point >> 6 ) | 0xC0;
             $byte2 = $code_point & 0x3F | 0x80;
             return pack( 'CC', $byte1, $byte2 );
+            $byte1 = chr( ( $code_point >> 6 ) | 0xC0 );
+            $byte2 = chr( $code_point & 0x3F | 0x80 );
+            return "{$byte1}{$byte2}";
+        }
         if ( $code_point <= 0xFFFF ) {
             $byte1 = ( $code_point >> 12 ) | 0xE0;
             $byte2 = ( $code_point >> 6 ) & 0x3F | 0x80;
             $byte3 = $code_point & 0x3F | 0x80;
             return pack( 'CCC', $byte1, $byte2, $byte3 );
+            $byte1 = chr( ( $code_point >> 12 ) | 0xE0 );
+            $byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
+            $byte3 = chr( $code_point & 0x3F | 0x80 );
+            return "{$byte1}{$byte2}{$byte3}";
+        }
         // Any values above U+10FFFF are eliminated above in the pre-check.
         $byte1 = ( $code_point >> 18 ) | 0xF0;
         $byte2 = ( $code_point >> 12 ) & 0x3F | 0x80;
         $byte3 = ( $code_point >> 6 ) & 0x3F | 0x80;
         $byte4 = $code_point & 0x3F | 0x80;
         return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 );
+        $byte1 = chr( ( $code_point >> 18 ) | 0xF0 );
+        $byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 );
+        $byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
+        $byte4 = chr( $code_point & 0x3F | 0x80 );
+        return "{$byte1}{$byte2}{$byte3}{$byte4}";
+    }
+}

trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

-                      r58559
+                      r58613
         $at         = $was_at;
         while ( false !== $at && $at < $doc_length ) {
+        while ( $at < $doc_length ) {
             $at = strpos( $html, '<', $at );
-            /*
-             * This does not imply an incomplete parse; it indicates that there
-             * can be nothing left in the document other than a #text node.
-             */
             if ( false === $at ) {
+                $this->parser_state         = self::STATE_TEXT_NODE;
+                $this->token_starts_at      = $was_at;
+                $this->token_length         = strlen( $html ) - $was_at;
+                $this->text_starts_at       = $was_at;
+                $this->text_length          = $this->token_length;
+                $this->bytes_already_parsed = strlen( $html );
+                return true;
+                break;
+            }
 …
                  * @see https://html.spec.whatwg.org/#tag-open-state
                  */
+                if ( strlen( $html ) > $at + 1 ) {
+                    $next_character  = $html[ $at + 1 ];
+                    $at_another_node = (
+                        '!' === $next_character ||
+                        '/' === $next_character ||
+                        '?' === $next_character ||
+                        ( 'A' <= $next_character && $next_character <= 'Z' ) ||
+                        ( 'a' <= $next_character && $next_character <= 'z' )
+                    );
+                    if ( ! $at_another_node ) {
+                        ++$at;
+                        continue;
+                    }
+                /*
+                 * After `<`, only `!`, `/`, `?`, or an ASCII letter can start another
+                 * node. The accepted list must contain every letter a-z and A-Z: a
+                 * missing letter (previously `D`) makes this check skip past the `<`
+                 * of tags such as `<DIV>` instead of treating it as a tag opener.
+                 */
+                if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) {
+                    ++$at;
+                    continue;
+                }
 …
                  * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
                  */
+                if (
+                    $doc_length > $at + 3 &&
+                    '-' === $html[ $at + 2 ] &&
+                    '-' === $html[ $at + 3 ]
+                ) {
+                if ( 0 === substr_compare( $html, '--', $at + 2, 2 ) ) {
                     $closer_at = $at + 4;
                     // If it's not possible to close the comment then there is nothing more to scan.
 …
+        }
+        return false;
+        /*
+         * This does not imply an incomplete parse; it indicates that there
+         * can be nothing left in the document other than a #text node.
+         */
+        $this->parser_state         = self::STATE_TEXT_NODE;
+        $this->token_starts_at      = $was_at;
+        $this->token_length         = $doc_length - $was_at;
+        $this->text_starts_at       = $was_at;
+        $this->text_length          = $this->token_length;
+        $this->bytes_already_parsed = $doc_length;
+        return true;
+    }
 …
      */
     private function parse_next_attribute() {
+        $doc_length = strlen( $this->html );
         // Skip whitespace and slashes.
         $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
         if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+        if ( $this->bytes_already_parsed >= $doc_length ) {
             $this->parser_state = self::STATE_INCOMPLETE_INPUT;
 …
         // No attribute, just tag closer.
         if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) {
+        if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= $doc_length ) {
             return false;
+        }
 …
         $attribute_name              = substr( $this->html, $attribute_start, $name_length );
         $this->bytes_already_parsed += $name_length;
         if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+        if ( $this->bytes_already_parsed >= $doc_length ) {
             $this->parser_state = self::STATE_INCOMPLETE_INPUT;
 …
         $this->skip_whitespace();
         if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+        if ( $this->bytes_already_parsed >= $doc_length ) {
             $this->parser_state = self::STATE_INCOMPLETE_INPUT;
 …
             ++$this->bytes_already_parsed;
             $this->skip_whitespace();
             if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+            if ( $this->bytes_already_parsed >= $doc_length ) {
                 $this->parser_state = self::STATE_INCOMPLETE_INPUT;
 …
                     $quote                      = $this->html[ $this->bytes_already_parsed ];
                     $value_start                = $this->bytes_already_parsed + 1;
+                    $value_length               = strcspn( $this->html, $quote, $value_start );
+                    $attribute_end              = $value_start + $value_length + 1;
+                    $end_quote_at               = strpos( $this->html, $quote, $value_start );
+                    $end_quote_at               = false === $end_quote_at ? $doc_length : $end_quote_at;
+                    $value_length               = $end_quote_at - $value_start;
+                    $attribute_end              = $end_quote_at + 1;
                     $this->bytes_already_parsed = $attribute_end;
                     break;
 …
+        }
         if ( $attribute_end >= strlen( $this->html ) ) {
+        if ( $attribute_end >= $doc_length ) {
             $this->parser_state = self::STATE_INCOMPLETE_INPUT;
 …
         // If an attribute is listed many times, only use the first declaration and ignore the rest.
         if ( ! array_key_exists( $comparable_name, $this->attributes ) ) {
+        if ( ! isset( $this->attributes[ $comparable_name ] ) ) {
             $this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token(
                 $attribute_name,
 …
         if ( null === $this->duplicate_attributes ) {
             $this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
         } elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
+        } elseif ( ! isset( $this->duplicate_attributes[ $comparable_name ] ) ) {
             $this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span );
         } else {
 …
         // Removes any duplicated attributes if they were also present.
+        if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) {
+            foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
+                $this->lexical_updates[] = new WP_HTML_Text_Replacement(
+                    $attribute_token->start,
+                    $attribute_token->length,
+                    ''
+                );
+            }
+        foreach ( $this->duplicate_attributes[ $name ] ?? array() as $attribute_token ) {
+            $this->lexical_updates[] = new WP_HTML_Text_Replacement(
+                $attribute_token->start,
+                $attribute_token->length,
+                ''
+            );
+        }
 …
         // Does the tag name match the requested tag name in a case-insensitive manner?
+        if ( null !== $this->sought_tag_name ) {
+            /*
+             * String (byte) length lookup is fast. If they aren't the
+             * same length then they can't be the same string values.
+             */
+            if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) {
+                return false;
+            }
+            /*
+             * Check each character to determine if they are the same.
+             * Defer calls to `strtoupper()` to avoid them when possible.
+             * Calling `strcasecmp()` here tested slower than comparing each
+             * character, so unless benchmarks show otherwise, it should
+             * not be used.
+             *
+             * It's expected that most of the time that this runs, a
+             * lower-case tag name will be supplied and the input will
+             * contain lower-case tag names, thus normally bypassing
+             * the case comparison code.
+             */
+            for ( $i = 0; $i < $this->tag_name_length; $i++ ) {
+                $html_char = $this->html[ $this->tag_name_starts_at + $i ];
+                $tag_char  = $this->sought_tag_name[ $i ];
+                if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) {
+                    return false;
+                }
+            }
+        /*
+         * `substr_compare()` with an explicit length compares only that many
+         * bytes, so the byte lengths must be checked first. Without the length
+         * check, a sought name that is longer than this tag's name but begins
+         * with it (e.g. seeking "DIVIDER" while at a `div` tag) would compare
+         * equal over the first `tag_name_length` bytes and wrongly match.
+         */
+        if (
+            isset( $this->sought_tag_name ) &&
+            (
+                strlen( $this->sought_tag_name ) !== $this->tag_name_length ||
+                0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true )
+            )
+        ) {
+            return false;
+        }

Note: See TracChangeset for help on using the changeset viewer.

Trac UI Preferences

Make WordPress Core

Context Navigation

Changeset 58613

Legend:

trunk/src/wp-includes/html-api/class-wp-html-decoder.php

trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

Download in other formats: