Make WordPress Core

Changeset 58926


Ignore:
Timestamp:
08/23/2024 03:40:15 PM (7 months ago)
Author:
dmsnell
Message:

HTML API: Add support for missing FRAMESET and "after" insertion modes.

As part of the work to add more spec support to the HTML API, this patch adds support for the FRAMESET-related insertion modes ("in frameset", "after frameset", and "after after frameset"), as well as the missing "after" insertion modes ("after body" and "after after body"). The "after" modes run at the end of parsing a document, closing it and taking care of any lingering tags.

Developed in https://github.com/WordPress/wordpress-develop/pull/7165
Discussed in https://core.trac.wordpress.org/ticket/61576

Props dmsnell, jonsurrell.
See #61576.

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r58925 r58926  
    39733973     */
    39743974    private function step_after_body(): bool {
    3975         $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY . ' state.' );
     3975        $tag_name   = $this->get_token_name();
     3976        $token_type = $this->get_token_type();
     3977        $op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
     3978        $op         = "{$op_sigil}{$tag_name}";
     3979
     3980        switch ( $op ) {
     3981            /*
     3982             * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
     3983             * >   U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
     3984             *
     3985             * > Process the token using the rules for the "in body" insertion mode.
     3986             */
     3987            case '#text':
     3988                $text = $this->get_modifiable_text();
     3989                if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     3990                    return $this->step_in_body();
     3991                }
     3992                goto after_body_anything_else;
     3993                break;
     3994
     3995            /*
     3996             * > A comment token
     3997             */
     3998            case '#comment':
     3999            case '#funky-comment':
     4000            case '#presumptuous-tag':
     4001                $this->bail( 'Content outside of BODY is unsupported.' );
     4002                break;
     4003
     4004            /*
     4005             * > A DOCTYPE token
     4006             */
     4007            case 'html':
     4008                // Parse error: ignore the token.
     4009                return $this->step();
     4010
     4011            /*
     4012             * > A start tag whose tag name is "html"
     4013             */
     4014            case '+HTML':
     4015                return $this->step_in_body();
     4016
     4017            /*
     4018             * > An end tag whose tag name is "html"
     4019             *
     4020             * > If the parser was created as part of the HTML fragment parsing algorithm,
     4021             * > this is a parse error; ignore the token. (fragment case)
     4022             * >
     4023             * > Otherwise, switch the insertion mode to "after after body".
     4024             */
     4025            case '-HTML':
     4026                if ( isset( $this->context_node ) ) {
     4027                    return $this->step();
     4028                }
     4029
     4030                $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY;
     4031                return true;
     4032        }
     4033
     4034        /*
     4035         * > Parse error. Switch the insertion mode to "in body" and reprocess the token.
     4036         */
     4037        after_body_anything_else:
     4038        $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
     4039        return $this->step( self::REPROCESS_CURRENT_NODE );
    39764040    }
    39774041
     
    39924056     */
    39934057    private function step_in_frameset(): bool {
    3994         $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET . ' state.' );
     4058        $tag_name   = $this->get_token_name();
     4059        $token_type = $this->get_token_type();
     4060        $op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
     4061        $op         = "{$op_sigil}{$tag_name}";
     4062
     4063        switch ( $op ) {
     4064            /*
     4065             * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
     4066             * >   U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
     4067             * >
     4068             * > Insert the character.
     4069             *
     4070             * This algorithm effectively strips non-whitespace characters from text and inserts
     4071             * them under HTML. This is not supported at this time.
     4072             */
     4073            case '#text':
     4074                $text = $this->get_modifiable_text();
     4075                $text = $this->get_modifiable_text();
     4076                if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     4077                    return $this->step_in_body();
     4078                }
     4079                $this->bail( 'Non-whitespace characters cannot be handled in frameset.' );
     4080                break;
     4081
     4082            /*
     4083             * > A comment token
     4084             */
     4085            case '#comment':
     4086            case '#funky-comment':
     4087            case '#presumptuous-tag':
     4088                $this->insert_html_element( $this->state->current_token );
     4089                return true;
     4090
     4091            /*
     4092             * > A DOCTYPE token
     4093             */
     4094            case 'html':
     4095                // Parse error: ignore the token.
     4096                return $this->step();
     4097
     4098            /*
     4099             * > A start tag whose tag name is "html"
     4100             */
     4101            case '+HTML':
     4102                return $this->step_in_body();
     4103
     4104            /*
     4105             * > A start tag whose tag name is "frameset"
     4106             */
     4107            case '+FRAMESET':
     4108                $this->insert_html_element( $this->state->current_token );
     4109                return true;
     4110
     4111            /*
     4112             * > An end tag whose tag name is "frameset"
     4113             */
     4114            case '-FRAMESET':
     4115                /*
     4116                 * > If the current node is the root html element, then this is a parse error;
     4117                 * > ignore the token. (fragment case)
     4118                 */
     4119                if ( $this->state->stack_of_open_elements->current_node_is( 'HTML' ) ) {
     4120                    return $this->step();
     4121                }
     4122
     4123                /*
     4124                 * > Otherwise, pop the current node from the stack of open elements.
     4125                 */
     4126                $this->state->stack_of_open_elements->pop();
     4127
     4128                /*
     4129                 * > If the parser was not created as part of the HTML fragment parsing algorithm
     4130                 * > (fragment case), and the current node is no longer a frameset element, then
     4131                 * > switch the insertion mode to "after frameset".
     4132                 */
     4133                if ( ! isset( $this->context_node ) && ! $this->state->stack_of_open_elements->current_node_is( 'FRAMESET' ) ) {
     4134                    $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET;
     4135                }
     4136
     4137                return true;
     4138
     4139            /*
     4140             * > A start tag whose tag name is "frame"
     4141             *
     4142             * > Insert an HTML element for the token. Immediately pop the
     4143             * > current node off the stack of open elements.
     4144             * >
     4145             * > Acknowledge the token's self-closing flag, if it is set.
     4146             */
     4147            case '+FRAME':
     4148                $this->insert_html_element( $this->state->current_token );
     4149                $this->state->stack_of_open_elements->pop();
     4150                return true;
     4151
     4152            /*
     4153             * > A start tag whose tag name is "noframes"
     4154             */
     4155            case '+NOFRAMES':
     4156                return $this->step_in_head();
     4157        }
     4158
     4159        // Parse error: ignore the token.
     4160        return $this->step();
    39954161    }
    39964162
     
    40114177     */
    40124178    private function step_after_frameset(): bool {
    4013         $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET . ' state.' );
     4179        $tag_name   = $this->get_token_name();
     4180        $token_type = $this->get_token_type();
     4181        $op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
     4182        $op         = "{$op_sigil}{$tag_name}";
     4183
     4184        switch ( $op ) {
     4185            /*
     4186             * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
     4187             * >   U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
     4188             * >
     4189             * > Insert the character.
     4190             *
     4191             * This algorithm effectively strips non-whitespace characters from text and inserts
     4192             * them under HTML. This is not supported at this time.
     4193             */
     4194            case '#text':
     4195                $text = $this->get_modifiable_text();
     4196                if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     4197                    return $this->step_in_body();
     4198                }
     4199                $this->bail( 'Non-whitespace characters cannot be handled in after frameset' );
     4200                break;
     4201
     4202            /*
     4203             * > A comment token
     4204             */
     4205            case '#comment':
     4206            case '#funky-comment':
     4207            case '#presumptuous-tag':
     4208                $this->insert_html_element( $this->state->current_token );
     4209                return true;
     4210
     4211            /*
     4212             * > A DOCTYPE token
     4213             */
     4214            case 'html':
     4215                // Parse error: ignore the token.
     4216                return $this->step();
     4217
     4218            /*
     4219             * > A start tag whose tag name is "html"
     4220             */
     4221            case '+HTML':
     4222                return $this->step_in_body();
     4223
     4224            /*
     4225             * > An end tag whose tag name is "html"
     4226             */
     4227            case '-HTML':
     4228                $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET;
     4229                return true;
     4230
     4231            /*
     4232             * > A start tag whose tag name is "noframes"
     4233             */
     4234            case '+NOFRAMES':
     4235                return $this->step_in_head();
     4236        }
     4237
     4238        // Parse error: ignore the token.
     4239        return $this->step();
    40144240    }
    40154241
     
    40304256     */
    40314257    private function step_after_after_body(): bool {
    4032         $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY . ' state.' );
     4258        $tag_name   = $this->get_token_name();
     4259        $token_type = $this->get_token_type();
     4260        $op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
     4261        $op         = "{$op_sigil}{$tag_name}";
     4262
     4263        switch ( $op ) {
     4264            /*
     4265             * > A comment token
     4266             */
     4267            case '#comment':
     4268            case '#funky-comment':
     4269            case '#presumptuous-tag':
     4270                $this->bail( 'Content outside of HTML is unsupported.' );
     4271                break;
     4272
     4273            /*
     4274             * > A DOCTYPE token
     4275             * > A start tag whose tag name is "html"
     4276             *
     4277             * > Process the token using the rules for the "in body" insertion mode.
     4278             */
     4279            case 'html':
     4280            case '+HTML':
     4281                return $this->step_in_body();
     4282
     4283            /*
     4284             * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
     4285             * >   U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
     4286             * >
     4287             * > Process the token using the rules for the "in body" insertion mode.
     4288             */
     4289            case '#text':
     4290                $text = $this->get_modifiable_text();
     4291                if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     4292                    return $this->step_in_body();
     4293                }
     4294                goto after_after_body_anything_else;
     4295                break;
     4296        }
     4297
     4298        /*
     4299         * > Parse error. Switch the insertion mode to "in body" and reprocess the token.
     4300         */
     4301        after_after_body_anything_else:
     4302        $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
     4303        return $this->step( self::REPROCESS_CURRENT_NODE );
    40334304    }
    40344305
     
    40494320     */
    40504321    private function step_after_after_frameset(): bool {
    4051         $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET . ' state.' );
     4322        $tag_name   = $this->get_token_name();
     4323        $token_type = $this->get_token_type();
     4324        $op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
     4325        $op         = "{$op_sigil}{$tag_name}";
     4326
     4327        switch ( $op ) {
     4328            /*
     4329             * > A comment token
     4330             */
     4331            case '#comment':
     4332            case '#funky-comment':
     4333            case '#presumptuous-tag':
     4334                $this->bail( 'Content outside of HTML is unsupported.' );
     4335                break;
     4336
     4337            /*
     4338             * > A DOCTYPE token
     4339             * > A start tag whose tag name is "html"
     4340             *
     4341             * > Process the token using the rules for the "in body" insertion mode.
     4342             */
     4343            case 'html':
     4344            case '+HTML':
     4345                return $this->step_in_body();
     4346
     4347            /*
     4348             * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
     4349             * >   U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
     4350             * >
     4351             * > Process the token using the rules for the "in body" insertion mode.
     4352             *
     4353             * This algorithm effectively strips non-whitespace characters from text and inserts
     4354             * them under HTML. This is not supported at this time.
     4355             */
     4356            case '#text':
     4357                $text = $this->get_modifiable_text();
     4358                if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
     4359                    return $this->step_in_body();
     4360                }
     4361                $this->bail( 'Non-whitespace characters cannot be handled in after after frameset.' );
     4362                break;
     4363
     4364            /*
     4365             * > A start tag whose tag name is "noframes"
     4366             */
     4367            case '+NOFRAMES':
     4368                return $this->step_in_head();
     4369        }
     4370
     4371        // Parse error: ignore the token.
     4372        return $this->step();
    40524373    }
    40534374
     
    41164437            case '#cdata-section':
    41174438            case '#comment':
    4118             case '#funky_comment':
     4439            case '#funky-comment':
     4440            case '#presumptuous-tag':
    41194441                $this->insert_foreign_element( $this->state->current_token, false );
    41204442                return true;
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php

    r58925 r58926  
    3434        'tests14/line0022'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    3535        'tests14/line0055'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
     36        'tests19/line0488'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
     37        'tests19/line0500'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    3638        'tests19/line0965'       => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',
    3739        'tests19/line1079'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    3840        'tests2/line0207'        => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    3941        'tests2/line0686'        => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
     42        'tests2/line0697'        => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    4043        'tests2/line0709'        => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
    4144        'tests5/line0013'        => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',
Note: See TracChangeset for help on using the changeset viewer.