Make WordPress Core

Changeset 58925


Ignore:
Timestamp:
08/23/2024 02:53:59 PM (10 months ago)
Author:
dmsnell
Message:

HTML API: Parse DOCTYPE tokens and set HTML parser mode accordingly.

This patch adds until-now missing code to parse the structure of HTML DOCTYPE declarations. The DOCTYPE is mostly unused but can dictate the document compatibility mode, which governs whether CSS class names match in an ASCII-case-insensitive way or not, and whether TABLE elements close an open P element.

The DOCTYPE information is made available through a new method on the Tag Processor, get_doctype_info().

Developed in https://github.com/wordpress/wordpress-develop/pull/7195
Discussed in https://core.trac.wordpress.org/ticket/61576

Props dmsnell, jonsurrell.
See #61576.

Location:
trunk
Files:
2 added
6 edited

Legend:

Unmodified
Added
Removed
  • trunk/phpcs.xml.dist

    r58107 r58925  
    263263    <rule ref="Generic.PHP.DiscourageGoto.Found">
    264264        <exclude-pattern>/wp-includes/html-api/class-wp-html-processor\.php</exclude-pattern>
     265        <exclude-pattern>/wp-includes/html-api/class-wp-html-doctype-info\.php</exclude-pattern>
    265266    </rule>
    266267
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r58898 r58925  
    10771077             */
    10781078            case 'html':
    1079                 $contents = $this->get_modifiable_text();
    1080                 if ( ' html' !== $contents ) {
    1081                     /*
    1082                      * @todo When the HTML Tag Processor fully parses the DOCTYPE declaration,
    1083                      *       this code should examine the contents to set the compatability mode.
    1084                      */
    1085                     $this->bail( 'Cannot process any DOCTYPE other than a normative HTML5 doctype.' );
     1079                $doctype = $this->get_doctype_info();
     1080                if ( null !== $doctype && 'quirks' === $doctype->indicated_compatability_mode ) {
     1081                    $this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE;
    10861082                }
    10871083
     
    10901086                 */
    10911087                $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
     1088                $this->insert_html_element( $this->state->current_token );
    10921089                return true;
    10931090        }
     
    10971094         */
    10981095        initial_anything_else:
     1096        $this->state->document_mode  = WP_HTML_Processor_State::QUIRKS_MODE;
    10991097        $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
    11001098        return $this->step( self::REPROCESS_CURRENT_NODE );
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r58897 r58925  
    40284028
    40294029    /**
     4030     * Gets DOCTYPE declaration info from a DOCTYPE token.
     4031     *
     4032     * DOCTYPE tokens may appear in many places in an HTML document. In most places, they are
     4033     * simply ignored. The main parsing functions find the basic shape of DOCTYPE tokens but
     4034     * do not perform detailed parsing.
     4035     *
     4036     * This method can be called to perform a full parse of the DOCTYPE token and retrieve
     4037     * its information.
     4038     *
     4039     * @return WP_HTML_Doctype_Info|null The DOCTYPE declaration information or `null` if not
     4040     *                                   currently at a DOCTYPE node.
     4041     */
     4042    public function get_doctype_info(): ?WP_HTML_Doctype_Info {
     4043        if ( self::STATE_DOCTYPE !== $this->parser_state ) {
     4044            return null;
     4045        }
     4046
     4047        return WP_HTML_Doctype_Info::from_doctype_token( substr( $this->html, $this->token_starts_at, $this->token_length ) );
     4048    }
     4049
     4050    /**
    40304051     * Parser Ready State.
    40314052     *
     
    41184139    /**
    41194140     * Indicates that the parser has found a DOCTYPE node and it's
    4120      * possible to read and modify its modifiable text.
     4141     * possible to read its DOCTYPE information via `get_doctype_info()`.
    41214142     *
    41224143     * @since 6.5.0
  • trunk/src/wp-settings.php

    r58672 r58925  
    253253require ABSPATH . WPINC . '/html-api/class-wp-html-attribute-token.php';
    254254require ABSPATH . WPINC . '/html-api/class-wp-html-span.php';
     255require ABSPATH . WPINC . '/html-api/class-wp-html-doctype-info.php';
    255256require ABSPATH . WPINC . '/html-api/class-wp-html-text-replacement.php';
    256257require ABSPATH . WPINC . '/html-api/class-wp-html-decoder.php';
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php

    r58870 r58925  
    2828        'comments01/line0155'    => 'Unimplemented: Need to access raw comment text on non-normative comments.',
    2929        'comments01/line0169'    => 'Unimplemented: Need to access raw comment text on non-normative comments.',
     30        'doctype01/line0380'     => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',
    3031        'html5test-com/line0129' => 'Unimplemented: Need to access raw comment text on non-normative comments.',
    3132        'noscript01/line0014'    => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
     
    116117                $test_context_element = $test[1];
    117118
    118                 if ( self::should_skip_test( $test_context_element, $test_name, $test[3] ) ) {
     119                if ( self::should_skip_test( $test_context_element, $test_name ) ) {
    119120                    continue;
    120121                }
     
    134135     * @return bool True if the test case should be skipped. False otherwise.
    135136     */
    136     private static function should_skip_test( ?string $test_context_element, string $test_name, string $expected_tree ): bool {
     137    private static function should_skip_test( ?string $test_context_element, string $test_name ): bool {
    137138        if ( null !== $test_context_element && 'body' !== $test_context_element ) {
    138139            return true;
     
    190191
    191192            switch ( $token_type ) {
     193                case '#doctype':
     194                    $doctype = $processor->get_doctype_info();
     195                    $output .= "<!DOCTYPE {$doctype->name}";
     196                    if ( null !== $doctype->public_identifier || null !== $doctype->system_identifier ) {
     197                        $output .= " \"{$doctype->public_identifier}\" \"{$doctype->system_identifier}\"";
     198                    }
     199                    $output .= ">\n";
     200                    break;
     201
    192202                case '#tag':
    193203                    $namespace = $processor->get_namespace();
     
    451461                case 'document':
    452462                    if ( '|' === $line[0] ) {
    453                         /*
    454                          * The next_token() method these tests rely on do not stop
    455                          * at doctype nodes. Strip doctypes from output.
    456                          * @todo Restore this line if and when the processor
    457                          * exposes doctypes.
    458                          */
    459                         if ( '| <!DOCTYPE ' !== substr( $line, 0, 12 ) ) {
    460                             $test_dom .= substr( $line, 2 );
    461                         }
     463                        $test_dom .= substr( $line, 2 );
    462464                    } else {
    463465                        // This is a text node that includes unescaped newlines.
  • trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

    r58893 r58925  
    29402940        $this->assertTrue( $processor->paused_at_incomplete_token() );
    29412941    }
     2942
     2943    /**
     2944     * Test basic DOCTYPE handling.
     2945     *
     2946     * @ticket 61576
     2947     */
     2948    public function test_doctype_doc_name() {
     2949        $processor = new WP_HTML_Tag_Processor( '<!DOCTYPE html>' );
     2950        $this->assertTrue( $processor->next_token() );
     2951        $doctype = $processor->get_doctype_info();
     2952        $this->assertNotNull( $doctype );
     2953        $this->assertSame( 'html', $doctype->name );
     2954        $this->assertSame( 'no-quirks', $doctype->indicated_compatability_mode );
     2955        $this->assertNull( $doctype->public_identifier );
     2956        $this->assertNull( $doctype->system_identifier );
     2957    }
    29422958}
Note: See TracChangeset for help on using the changeset viewer.