Make WordPress Core

Changeset 59467


Ignore:
Timestamp:
11/27/2024 02:33:46 PM (8 hours ago)
Author:
Bernhard Reiter
Message:

HTML API: Allow more contexts in create_fragment.

This changeset modifies WP_HTML_Processor::create_fragment( $html, $context ) to use a full processor and create_fragment_at_current_node instead of the other way around. This makes more sense and makes the main factory methods clearer, because the state required for fragments is now set up only in create_fragment_at_current_node instead of in both create_fragment and create_fragment_at_current_node.

This allows for more HTML contexts to be provided to the basic create_fragment where the provided context HTML is appended to <!DOCTYPE html>, a full processor is created, the last tag opener is found, and a fragment parser is created at that node via create_fragment_at_current_node.

The HTML5lib tests are updated accordingly to use this new method to create fragments.

Props jonsurrell, dmsnell, bernhard-reiter.
Fixes #62584.

Location:
trunk
Files:
1 added
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r59463 r59467  
    280280     * impact the parse, such as with a SCRIPT tag and its `type` attribute.
    281281     *
    282      * ## Current HTML Support
    283      *
    284      *  - The only supported context is `<body>`, which is the default value.
    285      *  - The only supported document encoding is `UTF-8`, which is the default value.
     282     * Example:
     283     *
     284     *     // Usually, snippets of HTML ought to be processed in the default `<body>` context:
     285     *     $processor = WP_HTML_Processor::create_fragment( '<p>Hi</p>' );
     286     *
     287     *     // Some fragments should be processed in the correct context like this SVG:
     288     *     $processor = WP_HTML_Processor::create_fragment( '<rect width="10" height="10" />', '<svg>' );
     289     *
     290     *     // This fragment with TD tags should be processed in a TR context:
     291     *     $processor = WP_HTML_Processor::create_fragment(
     292     *         '<td>1<td>2<td>3',
     293     *         '<table><tbody><tr>'
     294     *     );
     295     *
     296     * In order to create a fragment processor at the correct location, the
     297     * provided fragment will be processed as part of a full HTML document.
     298     * The processor will search for the last opener tag in the document and
     299     * create a fragment processor at that location. The document will be
     300     * forced into "no-quirks" mode by including the HTML5 doctype.
     301     *
     302     * For advanced usage and precise control over the context element, use
     303     * `WP_HTML_Processor::create_full_parser()` and
     304     * `WP_HTML_Processor::create_fragment_at_current_node()`.
     305     *
     306     * UTF-8 is the only allowed encoding. If working with a document that
     307     * isn't UTF-8, first convert the document to UTF-8, then pass in the
     308     * converted HTML.
    286309     *
    287310     * @since 6.4.0
    288311     * @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances.
     312     * @since 6.8.0 Can create fragments with any context element.
    289313     *
    290314     * @param string $html     Input HTML fragment to process.
    291      * @param string $context  Context element for the fragment, must be default of `<body>`.
     315     * @param string $context  Context element for the fragment. Defaults to `<body>`.
    292316     * @param string $encoding Text encoding of the document; must be default of 'UTF-8'.
    293317     * @return static|null The created processor if successful, otherwise null.
    294318     */
    295319    public static function create_fragment( $html, $context = '<body>', $encoding = 'UTF-8' ) {
    296         if ( '<body>' !== $context || 'UTF-8' !== $encoding ) {
     320        $context_processor = static::create_full_parser( "<!DOCTYPE html>{$context}", $encoding );
     321        if ( null === $context_processor ) {
    297322            return null;
    298323        }
    299324
    300         $processor                             = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
    301         $processor->state->insertion_mode      = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
    302         $processor->state->encoding            = $encoding;
    303         $processor->state->encoding_confidence = 'certain';
    304 
    305         // @todo Create "fake" bookmarks for non-existent but implied nodes.
    306         $processor->bookmarks['root-node']    = new WP_HTML_Span( 0, 0 );
    307         $processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
    308 
    309         $root_node = new WP_HTML_Token(
    310             'root-node',
    311             'HTML',
    312             false
    313         );
    314 
    315         $processor->state->stack_of_open_elements->push( $root_node );
    316 
    317         $context_node = new WP_HTML_Token(
    318             'context-node',
    319             'BODY',
    320             false
    321         );
    322 
    323         $processor->context_node = $context_node;
    324         $processor->breadcrumbs  = array( 'HTML', $context_node->node_name );
    325 
    326         return $processor;
     325        while ( $context_processor->next_tag() ) {
     326            $context_processor->set_bookmark( 'final_node' );
     327        }
     328
     329        if (
     330            ! $context_processor->has_bookmark( 'final_node' ) ||
     331            ! $context_processor->seek( 'final_node' )
     332        ) {
     333            _doing_it_wrong( __METHOD__, __( 'No valid context element was detected.' ), '6.8.0' );
     334            return null;
     335        }
     336
     337        return $context_processor->create_fragment_at_current_node( $html );
    327338    }
    328339
     
    334345     * a context node of `<body>`.
    335346     *
    336      * Since UTF-8 is the only currently-accepted charset, if working with a
    337      * document that isn't UTF-8, it's important to convert the document before
    338      * creating the processor: pass in the converted HTML.
     347     * UTF-8 is the only allowed encoding. If working with a document that
     348     * isn't UTF-8, first convert the document to UTF-8, then pass in the
     349     * converted HTML.
    339350     *
    340351     * @param string      $html                    Input HTML document to process.
     
    460471     * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
    461472     *
     473     * @since 6.8.0
     474     *
    462475     * @param string $html Input HTML fragment to process.
    463476     * @return static|null The created processor if successful, otherwise null.
     
    465478    public function create_fragment_at_current_node( string $html ) {
    466479        if ( $this->get_token_type() !== '#tag' || $this->is_tag_closer() ) {
     480            _doing_it_wrong(
     481                __METHOD__,
     482                __( 'The context element must be a start tag.' ),
     483                '6.8.0'
     484            );
    467485            return null;
    468486        }
    469487
     488        $tag_name  = $this->current_element->token->node_name;
    470489        $namespace = $this->current_element->token->namespace;
     490
     491        if ( 'html' === $namespace && self::is_void( $tag_name ) ) {
     492            _doing_it_wrong(
     493                __METHOD__,
     494                sprintf(
     495                    // translators: %s: A tag name like INPUT or BR.
     496                    __( 'The context element cannot be a void element, found "%s".' ),
     497                    $tag_name
     498                ),
     499                '6.8.0'
     500            );
     501            return null;
     502        }
    471503
    472504        /*
     
    476508        if (
    477509            'html' === $namespace &&
    478             in_array( $this->current_element->token->node_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
     510            in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
    479511        ) {
     512            _doing_it_wrong(
     513                __METHOD__,
     514                sprintf(
     515                    // translators: %s: A tag name like IFRAME or TEXTAREA.
     516                    __( 'The context element "%s" is not supported.' ),
     517                    $tag_name
     518                ),
     519                '6.8.0'
     520            );
    480521            return null;
    481522        }
    482523
    483         $fragment_processor = static::create_fragment( $html );
    484         if ( null === $fragment_processor ) {
    485             return null;
    486         }
     524        $fragment_processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
    487525
    488526        $fragment_processor->compat_mode = $this->compat_mode;
    489527
    490         $fragment_processor->context_node                = clone $this->state->current_token;
     528        // @todo Create "fake" bookmarks for non-existent but implied nodes.
     529        $fragment_processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 );
     530        $root_node                                  = new WP_HTML_Token(
     531            'root-node',
     532            'HTML',
     533            false
     534        );
     535        $fragment_processor->state->stack_of_open_elements->push( $root_node );
     536
     537        $fragment_processor->bookmarks['context-node']   = new WP_HTML_Span( 0, 0 );
     538        $fragment_processor->context_node                = clone $this->current_element->token;
    491539        $fragment_processor->context_node->bookmark_name = 'context-node';
    492540        $fragment_processor->context_node->on_destroy    = null;
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessor.php

    r59450 r59467  
    10451045
    10461046    /**
    1047      * @ticket 62357
    1048      */
    1049     public function test_create_fragment_at_current_node_in_foreign_content() {
    1050         $processor = WP_HTML_Processor::create_full_parser( '<svg>' );
    1051         $this->assertTrue( $processor->next_tag( 'SVG' ) );
    1052 
    1053         $fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte<rect /><circle></circle><foreignobject><div></div></foreignobject><g>" );
    1054 
    1055         $this->assertSame( 'svg', $fragment->get_namespace() );
    1056         $this->assertTrue( $fragment->next_token() );
    1057 
    1058         /*
    1059          * In HTML parsing, a nul byte would be ignored.
    1060          * In SVG it should be replaced with a replacement character.
    1061          */
    1062         $this->assertSame( '#text', $fragment->get_token_type() );
    1063         $this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text() );
    1064 
    1065         $this->assertTrue( $fragment->next_tag( 'RECT' ) );
    1066         $this->assertSame( 'svg', $fragment->get_namespace() );
    1067 
    1068         $this->assertTrue( $fragment->next_tag( 'CIRCLE' ) );
    1069         $this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs() );
    1070         $this->assertTrue( $fragment->next_tag( 'foreignObject' ) );
    1071         $this->assertSame( 'svg', $fragment->get_namespace() );
    1072     }
    1073 
    1074     /**
    1075      * @ticket 62357
    1076      */
    1077     public function test_create_fragment_at_current_node_in_foreign_content_integration_point() {
    1078         $processor = WP_HTML_Processor::create_full_parser( '<svg><foreignObject>' );
    1079         $this->assertTrue( $processor->next_tag( 'foreignObject' ) );
    1080 
    1081         $fragment = $processor->create_fragment_at_current_node( "<image>\0not-preceded-by-nul-byte<rect />" );
    1082 
    1083         // Nothing has been processed, the html namespace should be used for parsing as an integration point.
    1084         $this->assertSame( 'html', $fragment->get_namespace() );
    1085 
    1086         // HTML parsing transforms IMAGE into IMG.
    1087         $this->assertTrue( $fragment->next_tag( 'IMG' ) );
    1088 
    1089         $this->assertTrue( $fragment->next_token() );
    1090 
    1091         // In HTML parsing, the nul byte is ignored and the text is reached.
    1092         $this->assertSame( '#text', $fragment->get_token_type() );
    1093         $this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text() );
    1094 
    1095         /*
    1096          * svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace.
    1097          * RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close.
    1098          */
    1099         $this->assertTrue( $fragment->next_tag( 'RECT' ) );
    1100         $this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs() );
    1101         $this->assertSame( 'html', $fragment->get_namespace() );
    1102         $this->assertTrue( $fragment->has_self_closing_flag() );
    1103         $this->assertTrue( $fragment->expects_closer() );
    1104     }
    1105 
    1106     /**
    1107      * @ticket 62357
    1108      */
    1109     public function test_prevent_fragment_creation_on_closers() {
    1110         $processor = WP_HTML_Processor::create_full_parser( '<p></p>' );
    1111         $processor->next_tag( 'P' );
    1112         $processor->next_tag(
    1113             array(
    1114                 'tag_name'    => 'P',
    1115                 'tag_closers' => 'visit',
    1116             )
    1117         );
    1118         $this->assertSame( 'P', $processor->get_tag() );
    1119         $this->assertTrue( $processor->is_tag_closer() );
    1120         $this->assertNull( $processor->create_fragment_at_current_node( '<i>fragment HTML</i>' ) );
    1121     }
    1122 
    1123     /**
    11241047     * Ensure that lowercased tag_name query matches tags case-insensitively.
    11251048     *
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php

    r59444 r59467  
    154154     */
    155155    private static function build_tree_representation( ?string $fragment_context, string $html ) {
    156         $processor = null;
    157156        if ( $fragment_context ) {
    158             if ( 'body' === $fragment_context ) {
    159                 $processor = WP_HTML_Processor::create_fragment( $html );
     157            /*
     158             * If the string of characters starts with "svg ", the context
     159             * element is in the SVG namespace and the substring after
     160             * "svg " is the local name. If the string of characters starts
     161             * with "math ", the context element is in the MathML namespace
     162             * and the substring after "math " is the local name.
     163             * Otherwise, the context element is in the HTML namespace and
     164             * the string is the local name.
     165             */
     166            if ( str_starts_with( $fragment_context, 'svg ' ) ) {
     167                $tag_name = substr( $fragment_context, 4 );
     168                if ( 'svg' === $tag_name ) {
     169                    $fragment_context_html = '<svg>';
     170                } else {
     171                    $fragment_context_html = "<svg><{$tag_name}>";
     172                }
     173            } elseif ( str_starts_with( $fragment_context, 'math ' ) ) {
     174                $tag_name = substr( $fragment_context, 5 );
     175                if ( 'math' === $tag_name ) {
     176                    $fragment_context_html = '<math>';
     177                } else {
     178                    $fragment_context_html = "<math><{$tag_name}>";
     179                }
    160180            } else {
    161 
    162                 /*
    163                  * If the string of characters starts with "svg ", the context
    164                  * element is in the SVG namespace and the substring after
    165                  * "svg " is the local name. If the string of characters starts
    166                  * with "math ", the context element is in the MathML namespace
    167                  * and the substring after "math " is the local name.
    168                  * Otherwise, the context element is in the HTML namespace and
    169                  * the string is the local name.
    170                  */
    171                 if ( str_starts_with( $fragment_context, 'svg ' ) ) {
    172                     $tag_name = substr( $fragment_context, 4 );
    173                     if ( 'svg' === $tag_name ) {
    174                         $parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><svg>' );
    175                     } else {
    176                         $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><svg><{$tag_name}>" );
    177                     }
    178                     $parent_processor->next_tag( $tag_name );
    179                 } elseif ( str_starts_with( $fragment_context, 'math ' ) ) {
    180                     $tag_name = substr( $fragment_context, 5 );
    181                     if ( 'math' === $tag_name ) {
    182                         $parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><math>' );
    183                     } else {
    184                         $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><math><{$tag_name}>" );
    185                     }
    186                     $parent_processor->next_tag( $tag_name );
     181                // Tags that only appear in tables need a special case.
     182                if ( in_array(
     183                    $fragment_context,
     184                    array(
     185                        'caption',
     186                        'col',
     187                        'colgroup',
     188                        'tbody',
     189                        'td',
     190                        'tfoot',
     191                        'th',
     192                        'thead',
     193                        'tr',
     194                    ),
     195                    true
     196                ) ) {
     197                    $fragment_context_html = "<table><{$fragment_context}>";
    187198                } else {
    188                     if ( in_array(
    189                         $fragment_context,
    190                         array(
    191                             'caption',
    192                             'col',
    193                             'colgroup',
    194                             'tbody',
    195                             'td',
    196                             'tfoot',
    197                             'th',
    198                             'thead',
    199                             'tr',
    200                         ),
    201                         true
    202                     ) ) {
    203                         $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><table><{$fragment_context}>" );
    204                         $parent_processor->next_tag();
    205                     } else {
    206                         $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><{$fragment_context}>" );
    207                     }
    208                     $parent_processor->next_tag( $fragment_context );
    209                 }
    210                 if ( null !== $parent_processor->get_unsupported_exception() ) {
    211                     throw $parent_processor->get_unsupported_exception();
    212                 }
    213                 if ( null !== $parent_processor->get_last_error() ) {
    214                     throw new Exception( $parent_processor->get_last_error() );
    215                 }
    216                 $processor = $parent_processor->create_fragment_at_current_node( $html );
    217             }
     199                    $fragment_context_html = "<{$fragment_context}>";
     200                }
     201            }
     202
     203            $processor = WP_HTML_Processor::create_fragment( $html, $fragment_context_html );
    218204
    219205            if ( null === $processor ) {
Note: See TracChangeset for help on using the changeset viewer.