Make WordPress Core


Ignore:
Timestamp:
11/27/2024 02:33:46 PM (18 months ago)
Author:
Bernhard Reiter
Message:

HTML API: Allow more contexts in create_fragment.

This changeset modifies WP_HTML_Processor::create_fragment( $html, $context ) to use a full processor and create_fragment_at_current_node instead of the other way around. This makes more sense and makes the main factory methods clearer: the state required for fragments is now set up only in create_fragment_at_current_node instead of in both create_fragment and create_fragment_at_current_node.

This allows more HTML contexts to be provided to the basic create_fragment: the provided context HTML is appended to <!DOCTYPE html>, a full processor is created, the last tag opener is found, and a fragment parser is created at that node via create_fragment_at_current_node.

The HTML5lib tests are updated accordingly to use this new method to create fragments.

Props jonsurrell, dmsnell, bernhard-reiter.
Fixes #62584.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r59463 r59467  
    280280     * impact the parse, such as with a SCRIPT tag and its `type` attribute.
    281281     *
    282      * ## Current HTML Support
    283      *
    284      *  - The only supported context is `<body>`, which is the default value.
    285      *  - The only supported document encoding is `UTF-8`, which is the default value.
     282     * Example:
     283     *
     284     *     // Usually, snippets of HTML ought to be processed in the default `<body>` context:
     285     *     $processor = WP_HTML_Processor::create_fragment( '<p>Hi</p>' );
     286     *
     287     *     // Some fragments should be processed in the correct context like this SVG:
     288     *     $processor = WP_HTML_Processor::create_fragment( '<rect width="10" height="10" />', '<svg>' );
     289     *
     290     *     // This fragment with TD tags should be processed in a TR context:
     291     *     $processor = WP_HTML_Processor::create_fragment(
     292     *         '<td>1<td>2<td>3',
     293     *         '<table><tbody><tr>'
     294     *     );
     295     *
     296     * In order to create a fragment processor at the correct location, the
     297     * provided fragment will be processed as part of a full HTML document.
     298     * The processor will search for the last opener tag in the document and
     299     * create a fragment processor at that location. The document will be
     300     * forced into "no-quirks" mode by including the HTML5 doctype.
     301     *
     302     * For advanced usage and precise control over the context element, use
     303     * `WP_HTML_Processor::create_full_parser()` and
     304     * `WP_HTML_Processor::create_fragment_at_current_node()`.
     305     *
     306     * UTF-8 is the only allowed encoding. If working with a document that
     307     * isn't UTF-8, first convert the document to UTF-8, then pass in the
     308     * converted HTML.
    286309     *
    287310     * @since 6.4.0
    288311     * @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances.
     312     * @since 6.8.0 Can create fragments with any context element.
    289313     *
    290314     * @param string $html     Input HTML fragment to process.
    291      * @param string $context  Context element for the fragment, must be default of `<body>`.
     315     * @param string $context  Context element for the fragment. Defaults to `<body>`.
    292316     * @param string $encoding Text encoding of the document; must be default of 'UTF-8'.
    293317     * @return static|null The created processor if successful, otherwise null.
    294318     */
    295319    public static function create_fragment( $html, $context = '<body>', $encoding = 'UTF-8' ) {
    296         if ( '<body>' !== $context || 'UTF-8' !== $encoding ) {
     320        $context_processor = static::create_full_parser( "<!DOCTYPE html>{$context}", $encoding );
     321        if ( null === $context_processor ) {
    297322            return null;
    298323        }
    299324
    300         $processor                             = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
    301         $processor->state->insertion_mode      = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
    302         $processor->state->encoding            = $encoding;
    303         $processor->state->encoding_confidence = 'certain';
    304 
    305         // @todo Create "fake" bookmarks for non-existent but implied nodes.
    306         $processor->bookmarks['root-node']    = new WP_HTML_Span( 0, 0 );
    307         $processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
    308 
    309         $root_node = new WP_HTML_Token(
    310             'root-node',
    311             'HTML',
    312             false
    313         );
    314 
    315         $processor->state->stack_of_open_elements->push( $root_node );
    316 
    317         $context_node = new WP_HTML_Token(
    318             'context-node',
    319             'BODY',
    320             false
    321         );
    322 
    323         $processor->context_node = $context_node;
    324         $processor->breadcrumbs  = array( 'HTML', $context_node->node_name );
    325 
    326         return $processor;
     325        while ( $context_processor->next_tag() ) {
     326            $context_processor->set_bookmark( 'final_node' );
     327        }
     328
     329        if (
     330            ! $context_processor->has_bookmark( 'final_node' ) ||
     331            ! $context_processor->seek( 'final_node' )
     332        ) {
     333            _doing_it_wrong( __METHOD__, __( 'No valid context element was detected.' ), '6.8.0' );
     334            return null;
     335        }
     336
     337        return $context_processor->create_fragment_at_current_node( $html );
    327338    }
    328339
     
    334345     * a context node of `<body>`.
    335346     *
    336      * Since UTF-8 is the only currently-accepted charset, if working with a
    337      * document that isn't UTF-8, it's important to convert the document before
    338      * creating the processor: pass in the converted HTML.
     347     * UTF-8 is the only allowed encoding. If working with a document that
     348     * isn't UTF-8, first convert the document to UTF-8, then pass in the
     349     * converted HTML.
    339350     *
    340351     * @param string      $html                    Input HTML document to process.
     
    460471     * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
    461472     *
     473     * @since 6.8.0
     474     *
    462475     * @param string $html Input HTML fragment to process.
    463476     * @return static|null The created processor if successful, otherwise null.
     
    465478    public function create_fragment_at_current_node( string $html ) {
    466479        if ( $this->get_token_type() !== '#tag' || $this->is_tag_closer() ) {
     480            _doing_it_wrong(
     481                __METHOD__,
     482                __( 'The context element must be a start tag.' ),
     483                '6.8.0'
     484            );
    467485            return null;
    468486        }
    469487
     488        $tag_name  = $this->current_element->token->node_name;
    470489        $namespace = $this->current_element->token->namespace;
     490
     491        if ( 'html' === $namespace && self::is_void( $tag_name ) ) {
     492            _doing_it_wrong(
     493                __METHOD__,
     494                sprintf(
     495                    // translators: %s: A tag name like INPUT or BR.
     496                    __( 'The context element cannot be a void element, found "%s".' ),
     497                    $tag_name
     498                ),
     499                '6.8.0'
     500            );
     501            return null;
     502        }
    471503
    472504        /*
     
    476508        if (
    477509            'html' === $namespace &&
    478             in_array( $this->current_element->token->node_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
     510            in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
    479511        ) {
     512            _doing_it_wrong(
     513                __METHOD__,
     514                sprintf(
     515                    // translators: %s: A tag name like IFRAME or TEXTAREA.
     516                    __( 'The context element "%s" is not supported.' ),
     517                    $tag_name
     518                ),
     519                '6.8.0'
     520            );
    480521            return null;
    481522        }
    482523
    483         $fragment_processor = static::create_fragment( $html );
    484         if ( null === $fragment_processor ) {
    485             return null;
    486         }
     524        $fragment_processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
    487525
    488526        $fragment_processor->compat_mode = $this->compat_mode;
    489527
    490         $fragment_processor->context_node                = clone $this->state->current_token;
     528        // @todo Create "fake" bookmarks for non-existent but implied nodes.
     529        $fragment_processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 );
     530        $root_node                                  = new WP_HTML_Token(
     531            'root-node',
     532            'HTML',
     533            false
     534        );
     535        $fragment_processor->state->stack_of_open_elements->push( $root_node );
     536
     537        $fragment_processor->bookmarks['context-node']   = new WP_HTML_Span( 0, 0 );
     538        $fragment_processor->context_node                = clone $this->current_element->token;
    491539        $fragment_processor->context_node->bookmark_name = 'context-node';
    492540        $fragment_processor->context_node->on_destroy    = null;
Note: See TracChangeset for help on using the changeset viewer.