Make WordPress Core

Changeset 58985


Ignore:
Timestamp:
09/04/2024 04:32:37 AM (5 months ago)
Author:
dmsnell
Message:

HTML API: Respect document compat mode when handling CSS class names.

The HTML API has been behaving as if CSS class name selectors matched class names in an ASCII case-insensitive manner. This is only true if the document in question is set to quirks mode. Unfortunately most documents processed will be set to no-quirks mode, meaning that some CSS behaviors have been matching incorrectly when provided with case variants of class names.

In this patch, the CSS methods have been audited and updated to adhere to the rules governing ASCII case sensitivity when matching classes. This includes add_class(), remove_class(), has_class(), and class_list(). Now, it is assumed that a document is in no-quirks mode unless a full HTML parser infers quirks mode, and these methods will treat class names in a byte-for-byte manner. Otherwise, when a document is in quirks mode, the methods will compare the provided class names against existing class names for the tag in an ASCII case insensitive way, while class_list() will return a lower-cased version of the existing class names.

The lower-casing in class_list() is performed for consistency, since it's possible that multiple case variants of the same comparable class name exist on a tag in the input HTML.

Developed in https://github.com/WordPress/wordpress-develop/pull/7169
Discussed in https://core.trac.wordpress.org/ticket/61531

Props dmsnell, jonsurrell.
See #61531.

Location:
trunk
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-processor-state.php

    r58867 r58985  
    301301
    302302    /**
    303      * No-quirks mode document compatibility mode.
    304      *
    305      * > In no-quirks mode, the behavior is (hopefully) the desired behavior
    306      * > described by the modern HTML and CSS specifications.
    307      *
    308      * @since 6.7.0
    309      *
    310      * @var string
    311      */
    312     const NO_QUIRKS_MODE = 'no-quirks-mode';
    313 
    314     /**
    315      * Quirks mode document compatibility mode.
    316      *
    317      * > In quirks mode, layout emulates behavior in Navigator 4 and Internet
    318      * > Explorer 5. This is essential in order to support websites that were
    319      * > built before the widespread adoption of web standards.
    320      *
    321      * @since 6.7.0
    322      *
    323      * @var string
    324      */
    325     const QUIRKS_MODE = 'quirks-mode';
    326 
    327     /**
    328303     * The stack of template insertion modes.
    329304     *
     
    381356     */
    382357    public $insertion_mode = self::INSERTION_MODE_INITIAL;
    383 
    384     /**
    385      * Indicates if the document is in quirks mode or no-quirks mode.
    386      *
    387      * Impact on HTML parsing:
    388      *
    389      *  - In `NO_QUIRKS_MODE` CSS class and ID selectors match in a byte-for-byte
    390      *    manner, otherwise for backwards compatibility, class selectors are to
    391      *    match in an ASCII case-insensitive manner.
    392      *
    393      *  - When not in `QUIRKS_MODE`, a TABLE start tag implicitly closes an open P tag
    394      *    if one is in scope and open, otherwise the TABLE becomes a child of the P.
    395      *
    396      * `QUIRKS_MODE` impacts many styling-related aspects of an HTML document, but
    397      * none of the other changes modifies how the HTML is parsed or selected.
    398      *
    399      * @see self::QUIRKS_MODE
    400      * @see self::NO_QUIRKS_MODE
    401      *
    402      * @since 6.7.0
    403      *
    404      * @var string
    405      */
    406     public $document_mode = self::NO_QUIRKS_MODE;
    407358
    408359    /**
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r58977 r58985  
    10811081                $doctype = $this->get_doctype_info();
    10821082                if ( null !== $doctype && 'quirks' === $doctype->indicated_compatability_mode ) {
    1083                     $this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE;
     1083                    $this->compat_mode = WP_HTML_Tag_Processor::QUIRKS_MODE;
    10841084                }
    10851085
     
    10961096         */
    10971097        initial_anything_else:
    1098         $this->state->document_mode  = WP_HTML_Processor_State::QUIRKS_MODE;
     1098        $this->compat_mode           = WP_HTML_Tag_Processor::QUIRKS_MODE;
    10991099        $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
    11001100        return $this->step( self::REPROCESS_CURRENT_NODE );
     
    24492449                 */
    24502450                if (
    2451                     WP_HTML_Processor_State::QUIRKS_MODE !== $this->state->document_mode &&
     2451                    WP_HTML_Tag_Processor::QUIRKS_MODE !== $this->compat_mode &&
    24522452                    $this->state->stack_of_open_elements->has_p_in_button_scope()
    24532453                ) {
     
    49384938     *
    49394939     * @since 6.6.0 Subclassed for the HTML Processor.
     4940     *
     4941     * @todo When reconstructing active formatting elements with attributes, find a way
     4942     *       to indicate if the virtually-reconstructed formatting elements contain the
     4943     *       wanted class name.
    49404944     *
    49414945     * @param string $wanted_class Look for this CSS class name; matched ASCII case-insensitively only in quirks mode.
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r58977 r58985  
    511511     */
    512512    protected $parser_state = self::STATE_READY;
     513
     514    /**
     515     * Indicates if the document is in quirks mode or no-quirks mode.
     516     *
     517     *  Impact on HTML parsing:
     518     *
     519     *   - In `NO_QUIRKS_MODE` (also known as "standard mode"):
     520     *       - CSS class and ID selectors match byte-for-byte (case-sensitively).
     521     *       - A TABLE start tag `<table>` implicitly closes any open `P` element.
     522     *
     523     *   - In `QUIRKS_MODE`:
     524     *       - CSS class and ID selectors match in an ASCII case-insensitive manner.
     525     *       - A TABLE start tag `<table>` opens a `TABLE` element as a child of a `P`
     526     *         element if one is open.
     527     *
     528     * Quirks and no-quirks mode are thus mostly about styling, but have an impact when
     529     * tables are found inside paragraph elements.
     530     *
     531     * @see self::QUIRKS_MODE
     532     * @see self::NO_QUIRKS_MODE
     533     *
     534     * @since 6.7.0
     535     *
     536     * @var string
     537     */
     538    protected $compat_mode = self::NO_QUIRKS_MODE;
    513539
    514540    /**
     
    11561182        $seen = array();
    11571183
     1184        $is_quirks = self::QUIRKS_MODE === $this->compat_mode;
     1185
    11581186        $at = 0;
    11591187        while ( $at < strlen( $class ) ) {
     
    11701198            }
    11711199
    1172             /*
    1173              * CSS class names are case-insensitive in the ASCII range.
    1174              *
    1175              * @see https://www.w3.org/TR/CSS2/syndata.html#x1
    1176              */
    1177             $name = str_replace( "\x00", "\u{FFFD}", strtolower( substr( $class, $at, $length ) ) );
    1178             $at  += $length;
     1200            $name = str_replace( "\x00", "\u{FFFD}", substr( $class, $at, $length ) );
     1201            if ( $is_quirks ) {
     1202                $name = strtolower( $name );
     1203            }
     1204            $at += $length;
    11791205
    11801206            /*
     
    12061232        }
    12071233
    1208         $wanted_class = strtolower( $wanted_class );
    1209 
     1234        $case_insensitive = self::QUIRKS_MODE === $this->compat_mode;
     1235
     1236        $wanted_length = strlen( $wanted_class );
    12101237        foreach ( $this->class_list() as $class_name ) {
    1211             if ( $class_name === $wanted_class ) {
     1238            if (
     1239                strlen( $class_name ) === $wanted_length &&
     1240                0 === substr_compare( $class_name, $wanted_class, 0, strlen( $wanted_class ), $case_insensitive )
     1241            ) {
    12121242                return true;
    12131243            }
     
    22972327        $modified = false;
    22982328
     2329        $seen      = array();
     2330        $to_remove = array();
     2331        $is_quirks = self::QUIRKS_MODE === $this->compat_mode;
     2332        if ( $is_quirks ) {
     2333            foreach ( $this->classname_updates as $updated_name => $action ) {
     2334                if ( self::REMOVE_CLASS === $action ) {
     2335                    $to_remove[] = strtolower( $updated_name );
     2336                }
     2337            }
     2338        } else {
     2339            foreach ( $this->classname_updates as $updated_name => $action ) {
     2340                if ( self::REMOVE_CLASS === $action ) {
     2341                    $to_remove[] = $updated_name;
     2342                }
     2343            }
     2344        }
     2345
    22992346        // Remove unwanted classes by only copying the new ones.
    23002347        $existing_class_length = strlen( $existing_class );
     
    23122359            }
    23132360
    2314             $name = substr( $existing_class, $at, $name_length );
    2315             $at  += $name_length;
    2316 
    2317             // If this class is marked for removal, start processing the next one.
    2318             $remove_class = (
    2319                 isset( $this->classname_updates[ $name ] ) &&
    2320                 self::REMOVE_CLASS === $this->classname_updates[ $name ]
    2321             );
    2322 
    2323             // If a class has already been seen then skip it; it should not be added twice.
    2324             if ( ! $remove_class ) {
    2325                 $this->classname_updates[ $name ] = self::SKIP_CLASS;
    2326             }
    2327 
    2328             if ( $remove_class ) {
     2361            $name                  = substr( $existing_class, $at, $name_length );
     2362            $comparable_class_name = $is_quirks ? strtolower( $name ) : $name;
     2363            $at                   += $name_length;
     2364
     2365            // If this class is marked for removal, remove it and move on to the next one.
     2366            if ( in_array( $comparable_class_name, $to_remove, true ) ) {
    23292367                $modified = true;
    23302368                continue;
    23312369            }
     2370
     2371            // If a class has already been seen then skip it; it should not be added twice.
     2372            if ( in_array( $comparable_class_name, $seen, true ) ) {
     2373                continue;
     2374            }
     2375
     2376            $seen[] = $comparable_class_name;
    23322377
    23332378            /*
     
    23512396        // Add new classes by appending those which haven't already been seen.
    23522397        foreach ( $this->classname_updates as $name => $operation ) {
    2353             if ( self::ADD_CLASS === $operation ) {
     2398            $comparable_name = $is_quirks ? strtolower( $name ) : $name;
     2399            if ( self::ADD_CLASS === $operation && ! in_array( $comparable_name, $seen, true ) ) {
    23542400                $modified = true;
    23552401
     
    39333979        }
    39343980
     3981        if ( self::QUIRKS_MODE !== $this->compat_mode ) {
     3982            $this->classname_updates[ $class_name ] = self::ADD_CLASS;
     3983            return true;
     3984        }
     3985
     3986        /*
     3987         * Because class names are matched ASCII-case-insensitively in quirks mode,
     3988         * this needs to see if a case variant of the given class name is already
     3989         * enqueued and update that existing entry, if so. This picks the casing of
     3990         * the first-provided class name for all lexical variations.
     3991         */
     3992        $class_name_length = strlen( $class_name );
     3993        foreach ( $this->classname_updates as $updated_name => $action ) {
     3994            if (
     3995                strlen( $updated_name ) === $class_name_length &&
     3996                0 === substr_compare( $updated_name, $class_name, 0, $class_name_length, true )
     3997            ) {
     3998                $this->classname_updates[ $updated_name ] = self::ADD_CLASS;
     3999                return true;
     4000            }
     4001        }
     4002
    39354003        $this->classname_updates[ $class_name ] = self::ADD_CLASS;
    3936 
    39374004        return true;
    39384005    }
     
    39544021        }
    39554022
    3956         if ( null !== $this->tag_name_starts_at ) {
     4023        if ( self::QUIRKS_MODE !== $this->compat_mode ) {
    39574024            $this->classname_updates[ $class_name ] = self::REMOVE_CLASS;
    3958         }
    3959 
     4025            return true;
     4026        }
     4027
     4028        /*
     4029         * Because class names are matched ASCII-case-insensitively in quirks mode,
     4030         * this needs to see if a case variant of the given class name is already
     4031         * enqueued and update that existing entry, if so. This picks the casing of
     4032         * the first-provided class name for all lexical variations.
     4033         */
     4034        $class_name_length = strlen( $class_name );
     4035        foreach ( $this->classname_updates as $updated_name => $action ) {
     4036            if (
     4037                strlen( $updated_name ) === $class_name_length &&
     4038                0 === substr_compare( $updated_name, $class_name, 0, $class_name_length, true )
     4039            ) {
     4040                $this->classname_updates[ $updated_name ] = self::REMOVE_CLASS;
     4041                return true;
     4042            }
     4043        }
     4044
     4045        $this->classname_updates[ $class_name ] = self::REMOVE_CLASS;
    39604046        return true;
    39614047    }
     
    43524438
    43534439    /**
     4440     * No-quirks mode document compatibility mode.
     4441     *
     4442     * > In no-quirks mode, the behavior is (hopefully) the desired behavior
     4443     * > described by the modern HTML and CSS specifications.
     4444     *
     4445     * @see self::$compat_mode
     4446     * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Quirks_Mode_and_Standards_Mode
     4447     *
     4448     * @since 6.7.0
     4449     *
     4450     * @var string
     4451     */
     4452    const NO_QUIRKS_MODE = 'no-quirks-mode';
     4453
     4454    /**
     4455     * Quirks mode document compatibility mode.
     4456     *
     4457     * > In quirks mode, layout emulates behavior in Navigator 4 and Internet
     4458     * > Explorer 5. This is essential in order to support websites that were
     4459     * > built before the widespread adoption of web standards.
     4460     *
     4461     * @see self::$compat_mode
     4462     * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Quirks_Mode_and_Standards_Mode
     4463     *
     4464     * @since 6.7.0
     4465     *
     4466     * @var string
     4467     */
     4468    const QUIRKS_MODE = 'quirks-mode';
     4469
     4470    /**
    43544471     * Indicates that a span of text may contain any combination of significant
    43554472     * kinds of characters: NULL bytes, whitespace, and others.
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessor.php

    r58892 r58985  
    520520        $this->assertTrue( $processor->next_tag( 'script' ) );
    521521    }
     522
     523    /**
     524     * Ensures that the tag processor is case sensitive when removing CSS classes in no-quirks mode.
     525     *
     526     * @ticket 61531
     527     *
     528     * @covers ::remove_class
     529     */
     530    public function test_remove_class_no_quirks_mode() {
     531        $processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><span class="UPPER">' );
     532        $processor->next_tag( 'SPAN' );
     533        $processor->remove_class( 'upper' );
     534        $this->assertSame( '<!DOCTYPE html><span class="UPPER">', $processor->get_updated_html() );
     535
     536        $processor->remove_class( 'UPPER' );
     537        $this->assertSame( '<!DOCTYPE html><span >', $processor->get_updated_html() );
     538    }
     539
     540    /**
     541     * Ensures that the tag processor is case sensitive when adding CSS classes in no-quirks mode.
     542     *
     543     * @ticket 61531
     544     *
     545     * @covers ::add_class
     546     */
     547    public function test_add_class_no_quirks_mode() {
     548        $processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><span class="UPPER">' );
     549        $processor->next_tag( 'SPAN' );
     550        $processor->add_class( 'UPPER' );
     551        $this->assertSame( '<!DOCTYPE html><span class="UPPER">', $processor->get_updated_html() );
     552
     553        $processor->add_class( 'upper' );
     554        $this->assertSame( '<!DOCTYPE html><span class="UPPER upper">', $processor->get_updated_html() );
     555    }
     556
     557    /**
     558     * Ensures that the tag processor is case sensitive when checking for CSS classes in no-quirks mode.
     559     *
     560     * @ticket 61531
     561     *
     562     * @covers ::has_class
     563     */
     564    public function test_has_class_no_quirks_mode() {
     565        $processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><span class="UPPER">' );
     566        $processor->next_tag( 'SPAN' );
     567        $this->assertFalse( $processor->has_class( 'upper' ) );
     568        $this->assertTrue( $processor->has_class( 'UPPER' ) );
     569    }
     570
     571    /**
     572     * Ensures that the tag processor lists unique CSS class names in no-quirks mode.
     573     *
     574     * @ticket 61531
     575     *
     576     * @covers ::class_list
     577     */
     578    public function test_class_list_no_quirks_mode() {
     579        $processor = WP_HTML_Processor::create_full_parser(
     580            /*
     581             * U+00C9 is LATIN CAPITAL LETTER E WITH ACUTE
     582             * U+0045 is LATIN CAPITAL LETTER E
     583             * U+0301 is COMBINING ACUTE ACCENT
     584             *
     585             * This tests not only that the class matching deduplicates the É, but also
     586             * that it treats the same character in different normalization forms as
     587             * distinct, since matching occurs on a byte-for-byte basis.
     588             */
     589            "<!DOCTYPE html><span class='A A a B b \u{C9} \u{45}\u{0301} \u{C9} é'>"
     590        );
     591        $processor->next_tag( 'SPAN' );
     592        $class_list = iterator_to_array( $processor->class_list() );
     593        $this->assertSame(
     594            array( 'A', 'a', 'B', 'b', 'É', "E\u{0301}", 'é' ),
     595            $class_list
     596        );
     597    }
     598
     599    /**
     600     * Ensures that the tag processor is case insensitive when removing CSS classes in quirks mode.
     601     *
     602     * @ticket 61531
     603     *
     604     * @covers ::remove_class
     605     */
     606    public function test_remove_class_quirks_mode() {
     607        $processor = WP_HTML_Processor::create_full_parser( '<span class="uPPER">' );
     608        $processor->next_tag( 'SPAN' );
     609        $processor->remove_class( 'upPer' );
     610        $this->assertSame( '<span >', $processor->get_updated_html() );
     611    }
     612
     613    /**
     614     * Ensures that the tag processor is case insensitive when adding CSS classes in quirks mode.
     615     *
     616     * @ticket 61531
     617     *
     618     * @covers ::add_class
     619     */
     620    public function test_add_class_quirks_mode() {
     621        $processor = WP_HTML_Processor::create_full_parser( '<span class="UPPER">' );
     622        $processor->next_tag( 'SPAN' );
     623        $processor->add_class( 'upper' );
     624
     625        $this->assertSame( '<span class="UPPER">', $processor->get_updated_html() );
     626
     627        $processor->add_class( 'ANOTHER-UPPER' );
     628        $this->assertSame( '<span class="UPPER ANOTHER-UPPER">', $processor->get_updated_html() );
     629    }
     630
     631    /**
     632     * Ensures that the tag processor is case insensitive when checking for CSS classes in quirks mode.
     633     *
     634     * @ticket 61531
     635     *
     636     * @covers ::has_class
     637     */
     638    public function test_has_class_quirks_mode() {
     639        $processor = WP_HTML_Processor::create_full_parser( '<span class="UPPER">' );
     640        $processor->next_tag( 'SPAN' );
     641        $this->assertTrue( $processor->has_class( 'upper' ) );
     642        $this->assertTrue( $processor->has_class( 'UPPER' ) );
     643    }
     644
     645    /**
     646     * Ensures that the tag processor lists unique CSS class names in quirks mode.
     647     *
     648     * @ticket 61531
     649     *
     650     * @covers ::class_list
     651     */
     652    public function test_class_list_quirks_mode() {
     653        $processor = WP_HTML_Processor::create_full_parser(
     654            /*
     655             * U+00C9 is LATIN CAPITAL LETTER E WITH ACUTE
     656             * U+0045 is LATIN CAPITAL LETTER E
     657             * U+0065 is LATIN SMALL LETTER E
     658             * U+0301 is COMBINING ACUTE ACCENT
     659             *
     660             * This tests not only that the class matching deduplicates the É, but also
     661             * that it treats the same character in different normalization forms as
     662             * distinct, since bytes outside the ASCII letters are still matched byte-for-byte in quirks mode.
     663             */
     664            "<span class='A A a B b \u{C9} \u{45}\u{301} \u{C9} é \u{65}\u{301}'>"
     665        );
     666        $processor->next_tag( 'SPAN' );
     667        $class_list = iterator_to_array( $processor->class_list() );
     668        $this->assertSame(
     669            array( 'a', 'b', 'É', "e\u{301}", 'é' ),
     670            $class_list
     671        );
     672    }
    522673}
Note: See TracChangeset for help on using the changeset viewer.