Make WordPress Core

Changeset 59444


Ignore:
Timestamp:
11/21/2024 01:27:58 PM (2 months ago)
Author:
Bernhard Reiter
Message:

HTML API: Add method to create fragment at node.

HTML Fragment parsing always happens with a context node, which may impact how a fragment of HTML is parsed. HTML Fragment Processors can be instantiated with a BODY context node via WP_HTML_Processor::create_fragment( $html ).

This changeset adds an instance method called create_fragment_at_current_node( string $html ). It can only be called when the processor is paused at a #tag, with some additional constraints:

  • The opening and closing tags must appear in the HTML input (no virtual tokens).
  • No "self-contained" elements are allowed (IFRAME, SCRIPT, TITLE, etc.).

If successful, the method will return a WP_HTML_Processor instance whose context is inherited from the node that the method was called from.

Props jonsurrell, bernhard-reiter, gziolo.
Fixes #62357.

Location:
trunk
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r59422 r59444  
    426426
    /**
     * Creates a fragment processor at the current node.
     *
     * HTML Fragment parsing always happens with a context node. HTML Fragment Processors can be
     * instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment( $html )`.
     *
     * The context node may impact how a fragment of HTML is parsed. For example, consider the HTML
     * fragment `<td />Inside TD?</td>`.
     *
     * A BODY context node will produce the following tree:
     *
     *     └─#text Inside TD?
     *
     * Notice that the `<td>` tags are completely ignored.
     *
     * Compare that with an SVG context node that produces the following tree:
     *
     *     ├─svg:td
     *     └─#text Inside TD?
     *
     * Here, a `td` node in the `svg` namespace is created, and its self-closing flag is respected.
     * This is a peculiarity of parsing HTML in foreign content like SVG.
     *
     * Finally, consider the tree produced with a TABLE context node:
     *
     *     └─TBODY
     *       └─TR
     *         └─TD
     *           └─#text Inside TD?
     *
     * These examples demonstrate how important the context node may be when processing an HTML
     * fragment. Special care must be taken when processing fragments that are expected to appear
     * in specific contexts. SVG and TABLE are good examples, but there are others.
     *
     * No fragment processor is created, and `null` is returned, when:
     *
     *  - The current token is not a tag, or it is a tag closer.
     *  - The current node is an HTML element whose contents require a special tokenizer
     *    state: IFRAME, NOEMBED, NOFRAMES, SCRIPT, STYLE, TEXTAREA, TITLE, XMP, PLAINTEXT.
     *  - The underlying call to `create_fragment()` fails.
     *
     * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
     *
     * @param string $html Input HTML fragment to process.
     * @return static|null The created processor if successful, otherwise null.
     */
    public function create_fragment_at_current_node( string $html ) {
        /*
         * A fragment is parsed as the contents of its context element, so the processor
         * must be paused on an opening tag. Tag closers are also reported as `#tag` tokens
         * and therefore have to be rejected explicitly.
         */
        if ( '#tag' !== $this->get_token_type() || $this->is_tag_closer() ) {
            return null;
        }

        /*
         * Read the node name, the namespace, and the cloned context node from one and
         * the same token, so that the guards below and the resulting context node can
         * never disagree about which element the fragment is created at.
         */
        $context_token = $this->current_element->token;
        $namespace     = $context_token->namespace;

        /*
         * Prevent creating fragments at nodes that require a special tokenizer state.
         * This is unsupported by the HTML Processor.
         */
        if (
            'html' === $namespace &&
            in_array( $context_token->node_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
        ) {
            return null;
        }

        $fragment_processor = static::create_fragment( $html );
        if ( null === $fragment_processor ) {
            return null;
        }

        $fragment_processor->compat_mode = $this->compat_mode;

        // The clone is detached from this processor: it gets its own bookmark name and no destroy callback.
        $fragment_processor->context_node                = clone $context_token;
        $fragment_processor->context_node->bookmark_name = 'context-node';
        $fragment_processor->context_node->on_destroy    = null;

        // The state's context node is a pair of the node name and a map of attribute name => value.
        $fragment_processor->state->context_node = array( $fragment_processor->context_node->node_name, array() );

        $attribute_names = $this->get_attribute_names_with_prefix( '' );
        if ( null !== $attribute_names ) {
            foreach ( $attribute_names as $name ) {
                $fragment_processor->state->context_node[1][ $name ] = $this->get_attribute( $name );
            }
        }

        $fragment_processor->breadcrumbs = array( 'HTML', $fragment_processor->context_node->node_name );

        if ( 'TEMPLATE' === $fragment_processor->context_node->node_name ) {
            $fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE;
        }

        $fragment_processor->reset_insertion_mode_appropriately();

        /*
         * > Set the parser's form element pointer to the nearest node to the context element that
         * > is a form element (going straight up the ancestor chain, and including the element
         * > itself, if it is a form element), if any. (If there is no such form element, the
         * > form element pointer keeps its initial value, null.)
         */
        foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) {
            if ( 'FORM' === $element->node_name && 'html' === $element->namespace ) {
                $fragment_processor->state->form_element                = clone $element;
                $fragment_processor->state->form_element->bookmark_name = null;
                $fragment_processor->state->form_element->on_destroy    = null;
                break;
            }
        }

        $fragment_processor->state->encoding_confidence = 'irrelevant';

        /*
         * Update the parsing namespace near the end of the process.
         * This is important so that any push/pop from the stack of open
         * elements does not change the parsing namespace.
         *
         * When the context token is flagged as an integration node, the
         * fragment is parsed in the HTML namespace instead of the node's own.
         */
        $fragment_processor->change_parsing_namespace(
            $context_token->integration_node_type ? 'html' : $namespace
        );

        return $fragment_processor;
    }
     540
     541    /**
    428542     * Stops the parser and terminates its execution when encountering unsupported markup.
    429543     *
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessor.php

    r59422 r59444  
    10451045
    /**
     * Ensures that a fragment created at an SVG element starts out in the SVG namespace
     * and that its contents are parsed as foreign content.
     *
     * @ticket 62357
     */
    public function test_create_fragment_at_current_node_in_foreign_content() {
        $processor = WP_HTML_Processor::create_full_parser( '<svg>' );
        $this->assertTrue( $processor->next_tag( 'SVG' ), 'Failed to find the SVG context element.' );

        $fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte<rect /><circle></circle><foreignobject><div></div></foreignobject><g>" );

        // The factory may return null; fail with an assertion instead of a fatal error on the calls below.
        $this->assertNotNull( $fragment, 'Failed to create a fragment processor at the SVG element.' );

        $this->assertSame( 'svg', $fragment->get_namespace(), 'The fragment should start in the SVG namespace.' );
        $this->assertTrue( $fragment->next_token(), 'Failed to find the first token in the fragment.' );

        /*
         * In HTML parsing, a nul byte would be ignored.
         * In SVG it should be replaced with a replacement character.
         */
        $this->assertSame( '#text', $fragment->get_token_type(), 'The first token should be a text node.' );
        $this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text(), 'The nul byte should have been replaced with U+FFFD.' );

        $this->assertTrue( $fragment->next_tag( 'RECT' ), 'Failed to find the RECT element.' );
        $this->assertSame( 'svg', $fragment->get_namespace(), 'RECT should be in the SVG namespace.' );

        $this->assertTrue( $fragment->next_tag( 'CIRCLE' ), 'Failed to find the CIRCLE element.' );
        $this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs(), 'CIRCLE should be a direct child of the SVG context element.' );
        $this->assertTrue( $fragment->next_tag( 'foreignObject' ), 'Failed to find the foreignObject element.' );
        $this->assertSame( 'svg', $fragment->get_namespace(), 'foreignObject should be in the SVG namespace.' );
    }
     1073
     1074    /**
     1075     * @ticket 62357
     1076     */
     1077    public function test_create_fragment_at_current_node_in_foreign_content_integration_point() {
     1078        $processor = WP_HTML_Processor::create_full_parser( '<svg><foreignObject>' );
     1079        $this->assertTrue( $processor->next_tag( 'foreignObject' ) );
     1080
     1081        $fragment = $processor->create_fragment_at_current_node( "<image>\0not-preceded-by-nul-byte<rect />" );
     1082
     1083        // Nothing has been processed, the html namespace should be used for parsing as an integration point.
     1084        $this->assertSame( 'html', $fragment->get_namespace() );
     1085
     1086        // HTML parsing transforms IMAGE into IMG.
     1087        $this->assertTrue( $fragment->next_tag( 'IMG' ) );
     1088
     1089        $this->assertTrue( $fragment->next_token() );
     1090
     1091        // In HTML parsing, the nul byte is ignored and the text is reached.
     1092        $this->assertSame( '#text', $fragment->get_token_type() );
     1093        $this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text() );
     1094
     1095        /*
     1096         * svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace.
     1097         * RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close.
     1098         */
     1099        $this->assertTrue( $fragment->next_tag( 'RECT' ) );
     1100        $this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs() );
     1101        $this->assertSame( 'html', $fragment->get_namespace() );
     1102        $this->assertTrue( $fragment->has_self_closing_flag() );
     1103        $this->assertTrue( $fragment->expects_closer() );
     1104    }
     1105
     1106    /**
    10471107     * Ensure that lowercased tag_name query matches tags case-insensitively.
    10481108     *
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php

    r59075 r59444  
    139139     */
    140140    private static function should_skip_test( ?string $test_context_element, string $test_name ): bool {
    141         if ( null !== $test_context_element && 'body' !== $test_context_element ) {
    142             return true;
    143         }
    144 
    145141        if ( array_key_exists( $test_name, self::SKIP_TESTS ) ) {
    146142            return true;
     
    158154     */
    159155    private static function build_tree_representation( ?string $fragment_context, string $html ) {
    160         $processor = $fragment_context
    161             ? WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" )
    162             : WP_HTML_Processor::create_full_parser( $html );
    163         if ( null === $processor ) {
    164             throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() );
    165         }
    166 
    167         /*
    168          * The fragment parser will start in 2 levels deep at: html > body > [position]
    169          * and requires adjustment to initial parameters.
    170          * The full parser will not.
    171          */
     156        $processor = null;
     157        if ( $fragment_context ) {
     158            if ( 'body' === $fragment_context ) {
     159                $processor = WP_HTML_Processor::create_fragment( $html );
     160            } else {
     161
     162                /*
     163                 * If the string of characters starts with "svg ", the context
     164                 * element is in the SVG namespace and the substring after
     165                 * "svg " is the local name. If the string of characters starts
     166                 * with "math ", the context element is in the MathML namespace
     167                 * and the substring after "math " is the local name.
     168                 * Otherwise, the context element is in the HTML namespace and
     169                 * the string is the local name.
     170                 */
     171                if ( str_starts_with( $fragment_context, 'svg ' ) ) {
     172                    $tag_name = substr( $fragment_context, 4 );
     173                    if ( 'svg' === $tag_name ) {
     174                        $parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><svg>' );
     175                    } else {
     176                        $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><svg><{$tag_name}>" );
     177                    }
     178                    $parent_processor->next_tag( $tag_name );
     179                } elseif ( str_starts_with( $fragment_context, 'math ' ) ) {
     180                    $tag_name = substr( $fragment_context, 5 );
     181                    if ( 'math' === $tag_name ) {
     182                        $parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><math>' );
     183                    } else {
     184                        $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><math><{$tag_name}>" );
     185                    }
     186                    $parent_processor->next_tag( $tag_name );
     187                } else {
     188                    if ( in_array(
     189                        $fragment_context,
     190                        array(
     191                            'caption',
     192                            'col',
     193                            'colgroup',
     194                            'tbody',
     195                            'td',
     196                            'tfoot',
     197                            'th',
     198                            'thead',
     199                            'tr',
     200                        ),
     201                        true
     202                    ) ) {
     203                        $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><table><{$fragment_context}>" );
     204                        $parent_processor->next_tag();
     205                    } else {
     206                        $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><{$fragment_context}>" );
     207                    }
     208                    $parent_processor->next_tag( $fragment_context );
     209                }
     210                if ( null !== $parent_processor->get_unsupported_exception() ) {
     211                    throw $parent_processor->get_unsupported_exception();
     212                }
     213                if ( null !== $parent_processor->get_last_error() ) {
     214                    throw new Exception( $parent_processor->get_last_error() );
     215                }
     216                $processor = $parent_processor->create_fragment_at_current_node( $html );
     217            }
     218
     219            if ( null === $processor ) {
     220                throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() );
     221            }
     222        } else {
     223            $processor = WP_HTML_Processor::create_full_parser( $html );
     224            if ( null === $processor ) {
     225                throw new Exception( 'Could not create a full parser.' );
     226            }
     227        }
     228
    172229        $output       = '';
    173230        $indent_level = 0;
Note: See TracChangeset for help on using the changeset viewer.