Make WordPress Core

Changeset 58779


Ignore:
Timestamp:
07/22/2024 10:22:03 PM (18 months ago)
Author:
dmsnell
Message:

HTML API: Add missing tags in IN BODY insertion mode to HTML Processor.

As part of work to add more spec support to the HTML API, this patch adds
support for the remaining missing tags in the IN BODY insertion mode. Not
all of the added tags are supported, because in some cases they reset the
insertion mode and are reprocessed where they will be rejected.

This patch also improves the support of get_modifiable_text(), removing
a leading newline inside a LISTING, PRE, or TEXTAREA element.

Developed in https://github.com/WordPress/wordpress-develop/pull/6972
Discussed in https://core.trac.wordpress.org/ticket/61576

Props dmsnell, jonsurrell, westonruter.
See #61576.

Location:
trunk
Files:
1 added
13 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php

    r58769 r58779  
    8585
    8686        return $current_node ? $current_node : null;
     87    }
     88
     89    /**
     90     * Inserts a "marker" at the end of the list of active formatting elements.
     91     *
     92     * > The markers are inserted when entering applet, object, marquee,
     93     * > template, td, th, and caption elements, and are used to prevent
     94     * > formatting from "leaking" into applet, object, marquee, template,
     95     * > td, th, and caption elements.
     96     *
     97     * @see https://html.spec.whatwg.org/#concept-parser-marker
     98     *
     99     * @since 6.7.0
     100     */
     101    public function insert_marker(): void {
     102        $this->push( new WP_HTML_Token( null, 'marker', false ) );
    87103    }
    88104
     
    185201        }
    186202    }
     203
     204    /**
     205     * Clears the list of active formatting elements up to the last marker.
     206     *
     207     * > When the steps below require the UA to clear the list of active formatting elements up to
     208     * > the last marker, the UA must perform the following steps:
     209     * >
     210     * > 1. Let entry be the last (most recently added) entry in the list of active
     211     * >    formatting elements.
     212     * > 2. Remove entry from the list of active formatting elements.
     213     * > 3. If entry was a marker, then stop the algorithm at this point.
     214     * >    The list has been cleared up to the last marker.
     215     * > 4. Go to step 1.
     216     *
     217     * @see https://html.spec.whatwg.org/multipage/parsing.html#clear-the-list-of-active-formatting-elements-up-to-the-last-marker
     218     *
     219     * @since 6.7.0
     220     */
     221    public function clear_up_to_last_marker(): void {
     222        foreach ( $this->walk_up() as $item ) {
     223            array_pop( $this->stack );
     224            if ( 'marker' === $item->node_name ) {
     225                break;
     226            }
     227        }
     228    }
    187229}
  • trunk/src/wp-includes/html-api/class-wp-html-open-elements.php

    r58769 r58779  
    103103
    104104    /**
     105     * Returns the name of the node at the nth position on the stack
     106     * of open elements, or `null` if no such position exists.
     107     *
     108     * Note that this uses a 1-based index, which represents the
     109     * "nth item" on the stack, counting from the top (the first element
     110     * added, per the HTML spec), where the top-most element is the 1st, the second is the 2nd, etc...
     111     *
     112     * @since 6.7.0
     113     *
     114     * @param int $nth Retrieve the nth item on the stack, with 1 being
     115     *                 the top element, 2 being the second, etc...
     116     * @return string|null Name of the node on the stack at the given location,
     117     *                     or `null` if the location isn't on the stack.
     118     */
     119    public function at( int $nth ): ?string {
     120        foreach ( $this->walk_down() as $item ) {
     121            if ( 0 === --$nth ) {
     122                return $item->node_name;
     123            }
     124        }
     125
     126        return null;
     127    }
     128
     129    /**
     130     * Reports if a node of a given name is in the stack of open elements.
     131     *
     132     * @since 6.7.0
     133     *
     134     * @param string $node_name Name of node for which to check.
     135     * @return bool Whether a node of the given name is in the stack of open elements.
     136     */
     137    public function contains( string $node_name ): bool {
     138        foreach ( $this->walk_up() as $item ) {
     139            if ( $node_name === $item->node_name ) {
     140                return true;
     141            }
     142        }
     143
     144        return false;
     145    }
     146
     147    /**
    105148     * Reports if a specific node is in the stack of open elements.
    106149     *
     
    112155    public function contains_node( WP_HTML_Token $token ): bool {
    113156        foreach ( $this->walk_up() as $item ) {
    114             if ( $token->bookmark_name === $item->bookmark_name ) {
     157            if ( $token === $item ) {
    115158                return true;
    116159            }
     
    211254            }
    212255
    213             switch ( $node->node_name ) {
    214                 case 'HTML':
    215                     return false;
    216             }
    217 
    218256            if ( in_array( $node->node_name, $termination_list, true ) ) {
    219257                return false;
     
    227265     * Returns whether a particular element is in scope.
    228266     *
    229      * @since 6.4.0
     267     * > The stack of open elements is said to have a particular element in
     268     * > scope when it has that element in the specific scope consisting of
     269     * > the following element types:
     270     * >
     271     * >   - applet
     272     * >   - caption
     273     * >   - html
     274     * >   - table
     275     * >   - td
     276     * >   - th
     277     * >   - marquee
     278     * >   - object
     279     * >   - template
     280     * >   - MathML mi
     281     * >   - MathML mo
     282     * >   - MathML mn
     283     * >   - MathML ms
     284     * >   - MathML mtext
     285     * >   - MathML annotation-xml
     286     * >   - SVG foreignObject
     287     * >   - SVG desc
     288     * >   - SVG title
     289     *
     290     * @since 6.4.0
     291     * @since 6.7.0 Supports all required HTML elements.
    230292     *
    231293     * @see https://html.spec.whatwg.org/#has-an-element-in-scope
     
    238300            $tag_name,
    239301            array(
    240 
    241                 /*
    242                  * Because it's not currently possible to encounter
    243                  * one of the termination elements, they don't need
    244                  * to be listed here. If they were, they would be
    245                  * unreachable and only waste CPU cycles while
    246                  * scanning through HTML.
    247                  */
     302                'APPLET',
     303                'CAPTION',
     304                'HTML',
     305                'TABLE',
     306                'TD',
     307                'TH',
     308                'MARQUEE',
     309                'OBJECT',
     310                'TEMPLATE',
     311                // @todo: Support SVG and MathML nodes when support for foreign content is added.
    248312            )
    249313        );
     
    253317     * Returns whether a particular element is in list item scope.
    254318     *
     319     * > The stack of open elements is said to have a particular element
     320     * > in list item scope when it has that element in the specific scope
     321     * > consisting of the following element types:
     322     * >
     323     * >   - All the element types listed above for the has an element in scope algorithm.
     324     * >   - ol in the HTML namespace
     325     * >   - ul in the HTML namespace
     326     *
    255327     * @since 6.4.0
    256328     * @since 6.5.0 Implemented: no longer throws on every invocation.
     329     * @since 6.7.0 Supports all required HTML elements.
    257330     *
    258331     * @see https://html.spec.whatwg.org/#has-an-element-in-list-item-scope
     
    265338            $tag_name,
    266339            array(
    267                 // There are more elements that belong here which aren't currently supported.
     340                'APPLET',
     341                'BUTTON',
     342                'CAPTION',
     343                'HTML',
     344                'TABLE',
     345                'TD',
     346                'TH',
     347                'MARQUEE',
     348                'OBJECT',
    268349                'OL',
     350                'TEMPLATE',
    269351                'UL',
     352                // @todo: Support SVG and MathML nodes when support for foreign content is added.
    270353            )
    271354        );
     
    275358     * Returns whether a particular element is in button scope.
    276359     *
    277      * @since 6.4.0
     360     * > The stack of open elements is said to have a particular element
     361     * > in button scope when it has that element in the specific scope
     362     * > consisting of the following element types:
     363     * >
     364     * >   - All the element types listed above for the has an element in scope algorithm.
     365     * >   - button in the HTML namespace
     366     *
     367     * @since 6.4.0
     368     * @since 6.7.0 Supports all required HTML elements.
    278369     *
    279370     * @see https://html.spec.whatwg.org/#has-an-element-in-button-scope
     
    283374     */
    284375    public function has_element_in_button_scope( string $tag_name ): bool {
    285         return $this->has_element_in_specific_scope( $tag_name, array( 'BUTTON' ) );
     376        return $this->has_element_in_specific_scope(
     377            $tag_name,
     378            array(
     379                'APPLET',
     380                'BUTTON',
     381                'CAPTION',
     382                'HTML',
     383                'TABLE',
     384                'TD',
     385                'TH',
     386                'MARQUEE',
     387                'OBJECT',
     388                'TEMPLATE',
     389                // @todo: Support SVG and MathML nodes when support for foreign content is added.
     390            )
     391        );
    286392    }
    287393
     
    289395     * Returns whether a particular element is in table scope.
    290396     *
    291      * @since 6.4.0
     397     * > The stack of open elements is said to have a particular element
     398     * > in table scope when it has that element in the specific scope
     399     * > consisting of the following element types:
     400     * >
     401     * >   - html in the HTML namespace
     402     * >   - table in the HTML namespace
     403     * >   - template in the HTML namespace
     404     *
     405     * @since 6.4.0
     406     * @since 6.7.0 Full implementation.
    292407     *
    293408     * @see https://html.spec.whatwg.org/#has-an-element-in-table-scope
    294      *
    295      * @throws WP_HTML_Unsupported_Exception Always until this function is implemented.
    296409     *
    297410     * @param string $tag_name Name of tag to check.
     
    299412     */
    300413    public function has_element_in_table_scope( string $tag_name ): bool {
    301         throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on table scope.' );
    302 
    303         return false; // The linter requires this unreachable code until the function is implemented and can return.
     414        return $this->has_element_in_specific_scope(
     415            $tag_name,
     416            array(
     417                'HTML',
     418                'TABLE',
     419                'TEMPLATE',
     420            )
     421        );
    304422    }
    305423
     
    541659         */
    542660        switch ( $item->node_name ) {
     661            case 'APPLET':
    543662            case 'BUTTON':
     663            case 'CAPTION':
     664            case 'HTML':
     665            case 'TABLE':
     666            case 'TD':
     667            case 'TH':
     668            case 'MARQUEE':
     669            case 'OBJECT':
     670            case 'TEMPLATE':
    544671                $this->has_p_in_button_scope = false;
    545672                break;
     
    574701         */
    575702        switch ( $item->node_name ) {
     703            case 'APPLET':
    576704            case 'BUTTON':
     705            case 'CAPTION':
     706            case 'HTML':
     707            case 'P':
     708            case 'TABLE':
     709            case 'TD':
     710            case 'TH':
     711            case 'MARQUEE':
     712            case 'OBJECT':
     713            case 'TEMPLATE':
    577714                $this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' );
    578715                break;
    579 
    580             case 'P':
    581                 $this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' );
    582                 break;
    583716        }
    584717
  • trunk/src/wp-includes/html-api/class-wp-html-processor-state.php

    r58769 r58779  
    313313
    314314    /**
     315     * No-quirks mode document compatibility mode.
     316     *
     317     * > In no-quirks mode, the behavior is (hopefully) the desired behavior
     318     * > described by the modern HTML and CSS specifications.
     319     *
     320     * @since 6.7.0
     321     *
     322     * @var string
     323     */
     324    const NO_QUIRKS_MODE = 'no-quirks-mode';
     325
     326    /**
     327     * Quirks mode document compatibility mode.
     328     *
     329     * > In quirks mode, layout emulates behavior in Navigator 4 and Internet
     330     * > Explorer 5. This is essential in order to support websites that were
     331     * > built before the widespread adoption of web standards.
     332     *
     333     * @since 6.7.0
     334     *
     335     * @var string
     336     */
     337    const QUIRKS_MODE = 'quirks-mode';
     338
     339    /**
    315340     * The stack of template insertion modes.
    316341     *
     
    370395
    371396    /**
     397     * Indicates if the document is in quirks mode or no-quirks mode.
     398     *
     399     * Impact on HTML parsing:
     400     *
     401     *  - In `NO_QUIRKS_MODE` CSS class and ID selectors match in a byte-for-byte
     402     *    manner, otherwise for backwards compatibility, class selectors are to
     403     *    match in an ASCII case-insensitive manner.
     404     *
     405     *  - When not in `QUIRKS_MODE`, a TABLE start tag implicitly closes an open P tag
     406     *    if one is in scope and open, otherwise the TABLE becomes a child of the P.
     407     *
     408     * `QUIRKS_MODE` impacts many styling-related aspects of an HTML document, but
     409     * none of the other changes modifies how the HTML is parsed or selected.
     410     *
     411     * @see self::QUIRKS_MODE
     412     * @see self::NO_QUIRKS_MODE
     413     *
     414     * @since 6.7.0
     415     *
     416     * @var string
     417     */
     418    public $document_mode = self::NO_QUIRKS_MODE;
     419
     420    /**
    372421     * Context node initializing fragment parser, if created as a fragment parser.
    373422     *
     
    390439     */
    391440    public $head_element = null;
     441
     442    /**
     443     * FORM element pointer.
     444     *
     445     * > points to the last form element that was opened and whose end tag has
     446     * > not yet been seen. It is used to make form controls associate with
     447     * > forms in the face of dramatically bad markup, for historical reasons.
     448     * > It is ignored inside template elements.
     449     *
     450     * @todo This may be invalidated by a seek operation.
     451     *
     452     * @see https://html.spec.whatwg.org/#form-element-pointer
     453     *
     454     * @since 6.7.0
     455     *
     456     * @var WP_HTML_Token|null
     457     */
     458    public $form_element = null;
    392459
    393460    /**
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r58769 r58779  
    9898 * that the HTML Processor won't break any HTML it doesn't fully understand.
    9999 *
    100  * The following list specifies the HTML tags that _are_ supported:
     100 * The HTML Processor supports all elements other than a specific set:
    101101 *
    102  *  - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY.
    103  *  - Custom elements: All custom elements are supported. :)
    104  *  - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, OPTGROUP, OPTION, PROGRESS, SEARCH, SELECT.
    105  *  - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR.
    106  *  - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP.
    107  *  - Links: A.
    108  *  - Lists: DD, DL, DT, LI, OL, UL.
    109  *  - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO.
    110  *  - Paragraph: BR, P.
    111  *  - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
    112  *  - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION.
    113  *  - Templating elements: SLOT.
    114  *  - Text decoration: RUBY.
    115  *  - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER.
     102 *  - Any element inside a TABLE.
     103 *  - Any element inside foreign content, including SVG and MATH.
     104 *  - Any element outside the IN BODY insertion mode, e.g. doctype declarations, meta, links.
    116105 *
    117106 * ### Supported markup
     
    122111 * such a case it will stop processing.
    123112 *
    124  * The following list specifies HTML markup that _is_ supported:
     113 * The following list illustrates some common examples of unexpected HTML inputs that
     114 * the HTML Processor properly parses and represents:
    125115 *
    126  *  - Markup involving only those tags listed above.
    127  *  - Fully-balanced and non-overlapping tags.
    128  *  - HTML with unexpected tag closers.
    129  *  - Some unbalanced or overlapping tags.
    130  *  - P tags after unclosed P tags.
    131  *  - BUTTON tags after unclosed BUTTON tags.
    132  *  - A tags after unclosed A tags that don't involve any active formatting elements.
     116 *  - HTML with optional tags omitted, e.g. `<p>one<p>two`.
     117 *  - HTML with unexpected tag closers, e.g. `<p>one </span> more</p>`.
     118 *  - Non-void tags with self-closing flag, e.g. `<div/>the DIV is still open.</div>`.
     119 *  - Heading elements which close open heading elements of another level, e.g. `<h1>Closed by </h2>`.
     120 *  - Elements containing text that looks like other tags but isn't, e.g. `<title>The <img> is plaintext</title>`.
     121 *  - SCRIPT and STYLE tags containing text that looks like HTML but isn't, e.g. `<script>document.write('<p>Hi</p>');</script>`.
     122 *  - SCRIPT content which has been escaped, e.g. `<script><!-- document.write('<script>console.log("hi")</script>') --></script>`.
     123 *
     124 * ### Unsupported Features
     125 *
     126 * This parser does not report parse errors.
     127 *
     128 * Normally, when additional HTML or BODY tags are encountered in a document, if there
     129 * are any additional attributes on them that aren't found on the previous elements,
     130 * the existing HTML and BODY elements adopt those missing attribute values. This
     131 * parser does not add those additional attributes.
     132 *
     133 * In certain situations, elements are moved to a different part of the document in
     134 * a process called "adoption" and "fostering." Because the nodes move to a location
     135 * in the document that the parser had already processed, this parser does not support
     136 * these situations and will bail.
    133137 *
    134138 * @since 6.4.0
     
    11051109
    11061110        switch ( $op ) {
    1107             case '#comment':
    1108             case '#funky-comment':
    1109             case '#presumptuous-tag':
    1110                 $this->insert_html_element( $this->state->current_token );
    1111                 return true;
    1112 
    11131111            case '#text':
    1114                 $this->reconstruct_active_formatting_elements();
    1115 
    11161112                $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
    11171113
     
    11341130                }
    11351131
     1132                $this->reconstruct_active_formatting_elements();
     1133
    11361134                /*
    11371135                 * Whitespace-only text does not affect the frameset-ok flag.
     
    11471145                return true;
    11481146
     1147            case '#comment':
     1148            case '#funky-comment':
     1149            case '#presumptuous-tag':
     1150                $this->insert_html_element( $this->state->current_token );
     1151                return true;
     1152
     1153            /*
     1154             * > A DOCTYPE token
     1155             * > Parse error. Ignore the token.
     1156             */
    11491157            case 'html':
     1158                return $this->step();
     1159
     1160            /*
     1161             * > A start tag whose tag name is "html"
     1162             */
     1163            case '+HTML':
     1164                if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) {
     1165                    /*
     1166                     * > Otherwise, for each attribute on the token, check to see if the attribute
     1167                     * > is already present on the top element of the stack of open elements. If
     1168                     * > it is not, add the attribute and its corresponding value to that element.
     1169                     *
     1170                     * This parser does not currently support this behavior: ignore the token.
     1171                     */
     1172                }
     1173
     1174                // Ignore the token.
     1175                return $this->step();
     1176
     1177            /*
     1178             * > A start tag whose tag name is one of: "base", "basefont", "bgsound", "link",
     1179             * > "meta", "noframes", "script", "style", "template", "title"
     1180             * >
     1181             * > An end tag whose tag name is "template"
     1182             */
     1183            case '+BASE':
     1184            case '+BASEFONT':
     1185            case '+BGSOUND':
     1186            case '+LINK':
     1187            case '+META':
     1188            case '+NOFRAMES':
     1189            case '+SCRIPT':
     1190            case '+STYLE':
     1191            case '+TEMPLATE':
     1192            case '+TITLE':
     1193            case '-TEMPLATE':
     1194                return $this->step_in_head();
     1195
     1196            /*
     1197             * > A start tag whose tag name is "body"
     1198             *
     1199             * This tag in the IN BODY insertion mode is a parse error.
     1200             */
     1201            case '+BODY':
     1202                if (
     1203                    1 === $this->state->stack_of_open_elements->count() ||
     1204                    'BODY' !== $this->state->stack_of_open_elements->at( 2 ) ||
     1205                    $this->state->stack_of_open_elements->contains( 'TEMPLATE' )
     1206                ) {
     1207                    // Ignore the token.
     1208                    return $this->step();
     1209                }
     1210
    11501211                /*
    1151                  * > A DOCTYPE token
    1152                  * > Parse error. Ignore the token.
     1212                 * > Otherwise, set the frameset-ok flag to "not ok"; then, for each attribute
     1213                 * > on the token, check to see if the attribute is already present on the body
     1214                 * > element (the second element) on the stack of open elements, and if it is
     1215                 * > not, add the attribute and its corresponding value to that element.
     1216                 *
     1217                 * This parser does not currently support this behavior: ignore the token.
    11531218                 */
     1219                $this->state->frameset_ok = false;
    11541220                return $this->step();
    11551221
    11561222            /*
    1157              * > A start tag whose tag name is "button"
    1158              */
    1159             case '+BUTTON':
    1160                 if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) {
    1161                     // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
    1162                     $this->generate_implied_end_tags();
    1163                     $this->state->stack_of_open_elements->pop_until( 'BUTTON' );
    1164                 }
    1165 
    1166                 $this->reconstruct_active_formatting_elements();
    1167                 $this->insert_html_element( $this->state->current_token );
    1168                 $this->state->frameset_ok = false;
    1169 
    1170                 return true;
     1223             * > A start tag whose tag name is "frameset"
     1224             *
     1225             * This tag in the IN BODY insertion mode is a parse error.
     1226             */
     1227            case '+FRAMESET':
     1228                if (
     1229                    1 === $this->state->stack_of_open_elements->count() ||
     1230                    'BODY' !== $this->state->stack_of_open_elements->at( 2 ) ||
     1231                    false === $this->state->frameset_ok
     1232                ) {
     1233                    // Ignore the token.
     1234                    return $this->step();
     1235                }
     1236
     1237                /*
     1238                 * > Otherwise, run the following steps:
     1239                 */
     1240                $this->bail( 'Cannot process non-ignored FRAMESET tags.' );
     1241                break;
     1242
     1243            /*
     1244             * > An end tag whose tag name is "body"
     1245             */
     1246            case '-BODY':
     1247                if ( ! $this->state->stack_of_open_elements->has_element_in_scope( 'BODY' ) ) {
     1248                    // Parse error: ignore the token.
     1249                    return $this->step();
     1250                }
     1251
     1252                /*
     1253                 * > Otherwise, if there is a node in the stack of open elements that is not either a
     1254                 * > dd element, a dt element, an li element, an optgroup element, an option element,
     1255                 * > a p element, an rb element, an rp element, an rt element, an rtc element, a tbody
     1256                 * > element, a td element, a tfoot element, a th element, a thead element, a tr
     1257                 * > element, the body element, or the html element, then this is a parse error.
     1258                 *
     1259                 * There is nothing to do for this parse error, so don't check for it.
     1260                 */
     1261
     1262                $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY;
     1263                return true;
     1264
     1265            /*
     1266             * > An end tag whose tag name is "html"
     1267             */
     1268            case '-HTML':
     1269                if ( ! $this->state->stack_of_open_elements->has_element_in_scope( 'BODY' ) ) {
     1270                    // Parse error: ignore the token.
     1271                    return $this->step();
     1272                }
     1273
     1274                /*
     1275                 * > Otherwise, if there is a node in the stack of open elements that is not either a
     1276                 * > dd element, a dt element, an li element, an optgroup element, an option element,
     1277                 * > a p element, an rb element, an rp element, an rt element, an rtc element, a tbody
     1278                 * > element, a td element, a tfoot element, a th element, a thead element, a tr
     1279                 * > element, the body element, or the html element, then this is a parse error.
     1280                 *
     1281                 * There is nothing to do for this parse error, so don't check for it.
     1282                 */
     1283
     1284                $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY;
     1285                return $this->step( self::REPROCESS_CURRENT_NODE );
    11711286
    11721287            /*
     
    12091324
    12101325            /*
     1326             * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
     1327             */
     1328            case '+H1':
     1329            case '+H2':
     1330            case '+H3':
     1331            case '+H4':
     1332            case '+H5':
     1333            case '+H6':
     1334                if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
     1335                    $this->close_a_p_element();
     1336                }
     1337
     1338                if (
     1339                    in_array(
     1340                        $this->state->stack_of_open_elements->current_node()->node_name,
     1341                        array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ),
     1342                        true
     1343                    )
     1344                ) {
     1345                    // @todo Indicate a parse error once it's possible.
     1346                    $this->state->stack_of_open_elements->pop();
     1347                }
     1348
     1349                $this->insert_html_element( $this->state->current_token );
     1350                return true;
     1351
     1352            /*
     1353             * > A start tag whose tag name is one of: "pre", "listing"
     1354             */
     1355            case '+PRE':
     1356            case '+LISTING':
     1357                if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
     1358                    $this->close_a_p_element();
     1359                }
     1360
     1361                /*
     1362                 * > If the next token is a U+000A LINE FEED (LF) character token,
     1363                 * > then ignore that token and move on to the next one. (Newlines
     1364                 * > at the start of pre blocks are ignored as an authoring convenience.)
     1365                 *
     1366                 * This is handled in `get_modifiable_text()`.
     1367                 */
     1368
     1369                $this->insert_html_element( $this->state->current_token );
     1370                $this->state->frameset_ok = false;
     1371                return true;
     1372
     1373            /*
     1374             * > A start tag whose tag name is "form"
     1375             */
     1376            case '+FORM':
     1377                $stack_contains_template = $this->state->stack_of_open_elements->contains( 'TEMPLATE' );
     1378
     1379                if ( isset( $this->state->form_element ) && ! $stack_contains_template ) {
     1380                    // Parse error: ignore the token.
     1381                    return $this->step();
     1382                }
     1383
     1384                if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
     1385                    $this->close_a_p_element();
     1386                }
     1387
     1388                $this->insert_html_element( $this->state->current_token );
     1389                if ( ! $stack_contains_template ) {
     1390                    $this->state->form_element = $this->state->current_token;
     1391                }
     1392
     1393                return true;
     1394
     1395            /*
     1396             * > A start tag whose tag name is "li"
     1397             * > A start tag whose tag name is one of: "dd", "dt"
     1398             */
     1399            case '+DD':
     1400            case '+DT':
     1401            case '+LI':
     1402                $this->state->frameset_ok = false;
     1403                $node                     = $this->state->stack_of_open_elements->current_node();
     1404                $is_li                    = 'LI' === $token_name;
     1405
     1406                in_body_list_loop:
     1407                /*
     1408                 * The logic for LI and DT/DD is the same except for one point: LI elements _only_
     1409                 * close other LI elements, but a DT or DD element closes _any_ open DT or DD element.
     1410                 */
     1411                if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) {
     1412                    $node_name = $is_li ? 'LI' : $node->node_name;
     1413                    $this->generate_implied_end_tags( $node_name );
     1414                    if ( ! $this->state->stack_of_open_elements->current_node_is( $node_name ) ) {
     1415                        // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
     1416                    }
     1417
     1418                    $this->state->stack_of_open_elements->pop_until( $node_name );
     1419                    goto in_body_list_done;
     1420                }
     1421
     1422                if (
     1423                    'ADDRESS' !== $node->node_name &&
     1424                    'DIV' !== $node->node_name &&
     1425                    'P' !== $node->node_name &&
     1426                    $this->is_special( $node->node_name )
     1427                ) {
     1428                    /*
     1429                     * > If node is in the special category, but is not an address, div,
     1430                     * > or p element, then jump to the step labeled done below.
     1431                     */
     1432                    goto in_body_list_done;
     1433                } else {
     1434                    /*
     1435                     * > Otherwise, set node to the previous entry in the stack of open elements
     1436                     * > and return to the step labeled loop.
     1437                     */
     1438                    foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) {
     1439                        $node = $item;
     1440                        break;
     1441                    }
     1442                    goto in_body_list_loop;
     1443                }
     1444
     1445                in_body_list_done:
     1446                if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
     1447                    $this->close_a_p_element();
     1448                }
     1449
     1450                $this->insert_html_element( $this->state->current_token );
     1451                return true;
     1452
     1453            case '+PLAINTEXT':
     1454                if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
     1455                    $this->close_a_p_element();
     1456                }
     1457
     1458                /*
     1459                 * @todo This may need to be handled in the Tag Processor and turn into
     1460                 *       a single self-contained tag like TEXTAREA, whose modifiable text
     1461                 *       is the rest of the input document as plaintext.
     1462                 */
     1463                $this->bail( 'Cannot process PLAINTEXT elements.' );
     1464                break;
     1465
     1466            /*
     1467             * > A start tag whose tag name is "button"
     1468             */
     1469            case '+BUTTON':
     1470                if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) {
     1471                    // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
     1472                    $this->generate_implied_end_tags();
     1473                    $this->state->stack_of_open_elements->pop_until( 'BUTTON' );
     1474                }
     1475
     1476                $this->reconstruct_active_formatting_elements();
     1477                $this->insert_html_element( $this->state->current_token );
     1478                $this->state->frameset_ok = false;
     1479
     1480                return true;
     1481
     1482            /*
    12111483             * > An end tag whose tag name is one of: "address", "article", "aside", "blockquote",
    12121484             * > "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset",
    12131485             * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main",
    12141486             * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul"
     1487             *
     1488             * @todo This needs to check if the element in scope is an HTML element, meaning that
     1489             *       when SVG and MathML support is added, this needs to differentiate between an
     1490             *       HTML element of the given name, such as `<center>`, and a foreign element of
     1491             *       the same given name.
    12151492             */
    12161493            case '-ADDRESS':
     
    12551532
    12561533            /*
    1257              * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
    1258              */
    1259             case '+H1':
    1260             case '+H2':
    1261             case '+H3':
    1262             case '+H4':
    1263             case '+H5':
    1264             case '+H6':
    1265                 if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
    1266                     $this->close_a_p_element();
    1267                 }
    1268 
    1269                 if (
    1270                     in_array(
    1271                         $this->state->stack_of_open_elements->current_node()->node_name,
    1272                         array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ),
    1273                         true
    1274                     )
    1275                 ) {
    1276                     // @todo Indicate a parse error once it's possible.
    1277                     $this->state->stack_of_open_elements->pop();
    1278                 }
    1279 
    1280                 $this->insert_html_element( $this->state->current_token );
    1281                 return true;
    1282 
    1283             /*
    1284              * > A start tag whose tag name is one of: "pre", "listing"
    1285              */
    1286             case '+PRE':
    1287             case '+LISTING':
    1288                 if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
    1289                     $this->close_a_p_element();
    1290                 }
    1291                 $this->insert_html_element( $this->state->current_token );
    1292                 $this->state->frameset_ok = false;
    1293                 return true;
    1294 
    1295             /*
    1296              * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
    1297              */
    1298             case '-H1':
    1299             case '-H2':
    1300             case '-H3':
    1301             case '-H4':
    1302             case '-H5':
    1303             case '-H6':
    1304                 if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) {
     1534             * > An end tag whose tag name is "form"
     1535             */
     1536            case '-FORM':
     1537                if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) {
     1538                    $node                      = $this->state->form_element;
     1539                    $this->state->form_element = null;
     1540
    13051541                    /*
    1306                      * This is a parse error; ignore the token.
     1542                     * > If node is null or if the stack of open elements does not have node
     1543                     * > in scope, then this is a parse error; return and ignore the token.
    13071544                     *
    1308                      * @todo Indicate a parse error once it's possible.
     1545                     * @todo It's necessary to check if the form token itself is in scope, not
     1546                     *       simply whether any FORM is in scope.
    13091547                     */
    1310                     return $this->step();
    1311                 }
    1312 
    1313                 $this->generate_implied_end_tags();
    1314 
    1315                 if ( ! $this->state->stack_of_open_elements->current_node_is( $token_name ) ) {
    1316                     // @todo Record parse error: this error doesn't impact parsing.
    1317                 }
    1318 
    1319                 $this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' );
    1320                 return true;
    1321 
    1322             /*
    1323              * > A start tag whose tag name is "li"
    1324              * > A start tag whose tag name is one of: "dd", "dt"
    1325              */
    1326             case '+DD':
    1327             case '+DT':
    1328             case '+LI':
    1329                 $this->state->frameset_ok = false;
    1330                 $node                     = $this->state->stack_of_open_elements->current_node();
    1331                 $is_li                    = 'LI' === $token_name;
    1332 
    1333                 in_body_list_loop:
    1334                 /*
    1335                  * The logic for LI and DT/DD is the same except for one point: LI elements _only_
    1336                  * close other LI elements, but a DT or DD element closes _any_ open DT or DD element.
    1337                  */
    1338                 if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) {
    1339                     $node_name = $is_li ? 'LI' : $node->node_name;
    1340                     $this->generate_implied_end_tags( $node_name );
    1341                     if ( ! $this->state->stack_of_open_elements->current_node_is( $node_name ) ) {
     1548                    if (
     1549                        null === $node ||
     1550                        ! $this->state->stack_of_open_elements->has_element_in_scope( 'FORM' )
     1551                    ) {
     1552                        // Parse error: ignore the token.
     1553                        return $this->step();
     1554                    }
     1555
     1556                    $this->generate_implied_end_tags();
     1557                    if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
     1558                        // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
     1559                        $this->bail( 'Cannot close a FORM when other elements remain open as this would throw off the breadcrumbs for the following tokens.' );
     1560                    }
     1561
     1562                    $this->state->stack_of_open_elements->remove_node( $node );
     1563                } else {
     1564                    /*
     1565                     * > If the stack of open elements does not have a form element in scope,
     1566                     * > then this is a parse error; return and ignore the token.
     1567                     *
     1568                     * Note that unlike in the clause above, this is checking for any FORM in scope.
     1569                     */
     1570                    if ( ! $this->state->stack_of_open_elements->has_element_in_scope( 'FORM' ) ) {
     1571                        // Parse error: ignore the token.
     1572                        return $this->step();
     1573                    }
     1574
     1575                    $this->generate_implied_end_tags();
     1576
     1577                    if ( ! $this->state->stack_of_open_elements->current_node_is( 'FORM' ) ) {
    13421578                        // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
    13431579                    }
    13441580
    1345                     $this->state->stack_of_open_elements->pop_until( $node_name );
    1346                     goto in_body_list_done;
    1347                 }
    1348 
    1349                 if (
    1350                     'ADDRESS' !== $node->node_name &&
    1351                     'DIV' !== $node->node_name &&
    1352                     'P' !== $node->node_name &&
    1353                     $this->is_special( $node->node_name )
    1354                 ) {
    1355                     /*
    1356                      * > If node is in the special category, but is not an address, div,
    1357                      * > or p element, then jump to the step labeled done below.
    1358                      */
    1359                     goto in_body_list_done;
    1360                 } else {
    1361                     /*
    1362                      * > Otherwise, set node to the previous entry in the stack of open elements
    1363                      * > and return to the step labeled loop.
    1364                      */
    1365                     foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) {
    1366                         $node = $item;
    1367                         break;
    1368                     }
    1369                     goto in_body_list_loop;
    1370                 }
    1371 
    1372                 in_body_list_done:
    1373                 if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
    1374                     $this->close_a_p_element();
    1375                 }
    1376 
    1377                 $this->insert_html_element( $this->state->current_token );
     1581                    $this->state->stack_of_open_elements->pop_until( 'FORM' );
     1582                    return true;
     1583                }
     1584                break;
     1585
     1586            /*
     1587             * > An end tag whose tag name is "p"
     1588             */
     1589            case '-P':
     1590                if ( ! $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
     1591                    $this->insert_html_element( $this->state->current_token );
     1592                }
     1593
     1594                $this->close_a_p_element();
    13781595                return true;
    13791596
     
    14241641
    14251642            /*
    1426              * > An end tag whose tag name is "p"
    1427              */
    1428             case '-P':
    1429                 if ( ! $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
    1430                     $this->insert_html_element( $this->state->current_token );
    1431                 }
    1432 
    1433                 $this->close_a_p_element();
    1434                 return true;
    1435 
    1436             // > A start tag whose tag name is "a"
     1643             * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
     1644             */
     1645            case '-H1':
     1646            case '-H2':
     1647            case '-H3':
     1648            case '-H4':
     1649            case '-H5':
     1650            case '-H6':
     1651                if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) {
     1652                    /*
     1653                     * This is a parse error; ignore the token.
     1654                     *
     1655                     * @todo Indicate a parse error once it's possible.
     1656                     */
     1657                    return $this->step();
     1658                }
     1659
     1660                $this->generate_implied_end_tags();
     1661
     1662                if ( ! $this->state->stack_of_open_elements->current_node_is( $token_name ) ) {
     1663                    // @todo Record parse error: this error doesn't impact parsing.
     1664                }
     1665
     1666                $this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' );
     1667                return true;
     1668
     1669            /*
     1670             * > A start tag whose tag name is "a"
     1671             */
    14371672            case '+A':
    14381673                foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
     
    14761711
    14771712            /*
     1713             * > A start tag whose tag name is "nobr"
     1714             */
     1715            case '+NOBR':
     1716                $this->reconstruct_active_formatting_elements();
     1717
     1718                if ( $this->state->stack_of_open_elements->has_element_in_scope( 'NOBR' ) ) {
     1719                    // Parse error.
     1720                    $this->run_adoption_agency_algorithm();
     1721                    $this->reconstruct_active_formatting_elements();
     1722                }
     1723
     1724                $this->insert_html_element( $this->state->current_token );
     1725                $this->state->active_formatting_elements->push( $this->state->current_token );
     1726                return true;
     1727
     1728            /*
    14781729             * > An end tag whose tag name is one of: "a", "b", "big", "code", "em", "font", "i",
    14791730             * > "nobr", "s", "small", "strike", "strong", "tt", "u"
     
    14961747
    14971748            /*
     1749             * > A start tag whose tag name is one of: "applet", "marquee", "object"
     1750             */
     1751            case '+APPLET':
     1752            case '+MARQUEE':
     1753            case '+OBJECT':
     1754                $this->reconstruct_active_formatting_elements();
     1755                $this->insert_html_element( $this->state->current_token );
     1756                $this->state->active_formatting_elements->insert_marker();
     1757                $this->state->frameset_ok = false;
     1758                return true;
     1759
     1760            /*
     1761             * > An end tag token whose tag name is one of: "applet", "marquee", "object"
     1762             *
     1763             * @todo This needs to check if the element in scope is an HTML element, meaning that
     1764             *       when SVG and MathML support is added, this needs to differentiate between an
     1765             *       HTML element of the given name, such as `<object>`, and a foreign element of
     1766             *       the same given name.
     1767             */
     1768            case '-APPLET':
     1769            case '-MARQUEE':
     1770            case '-OBJECT':
     1771                if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) {
     1772                    // Parse error: ignore the token.
     1773                    return $this->step();
     1774                }
     1775
     1776                $this->generate_implied_end_tags();
     1777                if ( ! $this->state->stack_of_open_elements->current_node_is( $token_name ) ) {
     1778                    // This is a parse error.
     1779                }
     1780
     1781                $this->state->stack_of_open_elements->pop_until( $token_name );
     1782                $this->state->active_formatting_elements->clear_up_to_last_marker();
     1783                return true;
     1784
     1785            /*
     1786             * > A start tag whose tag name is "table"
     1787             */
     1788            case '+TABLE':
     1789                if (
     1790                    WP_HTML_Processor_State::QUIRKS_MODE !== $this->state->document_mode &&
     1791                    $this->state->stack_of_open_elements->has_p_in_button_scope()
     1792                ) {
     1793                    $this->close_a_p_element();
     1794                }
     1795
     1796                $this->insert_html_element( $this->state->current_token );
     1797                $this->state->frameset_ok    = false;
     1798                $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE;
     1799                return true;
     1800
     1801            /*
    14981802             * > An end tag whose tag name is "br"
    1499              * >   Parse error. Drop the attributes from the token, and act as described in the next
    1500              * >   entry; i.e. act as if this was a "br" start tag token with no attributes, rather
    1501              * >   than the end tag token that it actually is.
    1502              */
    1503             case '-BR':
    1504                 $this->bail( 'Closing BR tags require unimplemented special handling.' );
    1505                 // This return required because PHPCS can't determine that the call to bail() throws.
    1506                 return false;
     1803             *
     1804             * This is prevented from happening because the Tag Processor
     1805             * reports all closing BR tags as if they were opening tags.
     1806             */
    15071807
    15081808            /*
     
    15261826                $this->reconstruct_active_formatting_elements();
    15271827                $this->insert_html_element( $this->state->current_token );
    1528                 $type_attribute = $this->get_attribute( 'type' );
     1828
    15291829                /*
    15301830                 * > If the token does not have an attribute with the name "type", or if it does,
     
    15321832                 * > string "hidden", then: set the frameset-ok flag to "not ok".
    15331833                 */
     1834                $type_attribute = $this->get_attribute( 'type' );
    15341835                if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) {
    15351836                    $this->state->frameset_ok = false;
    15361837                }
     1838
     1839                return true;
     1840
     1841            /*
     1842             * > A start tag whose tag name is one of: "param", "source", "track"
     1843             */
     1844            case '+PARAM':
     1845            case '+SOURCE':
     1846            case '+TRACK':
     1847                $this->insert_html_element( $this->state->current_token );
    15371848                return true;
    15381849
     
    15491860
    15501861            /*
    1551              * > A start tag whose tag name is one of: "param", "source", "track"
    1552              */
    1553             case '+PARAM':
    1554             case '+SOURCE':
    1555             case '+TRACK':
     1862             * > A start tag whose tag name is "image"
     1863             */
     1864            case '+IMAGE':
     1865                /*
     1866                 * > Parse error. Change the token's tag name to "img" and reprocess it. (Don't ask.)
     1867                 *
     1868                 * Note that this is handled elsewhere, so it should not be possible to reach this code.
     1869                 */
     1870                $this->bail( "Cannot process an IMAGE tag. (Don't ask.)" );
     1871                break;
     1872
     1873            /*
     1874             * > A start tag whose tag name is "textarea"
     1875             */
     1876            case '+TEXTAREA':
     1877                $this->insert_html_element( $this->state->current_token );
     1878
     1879                /*
     1880                 * > If the next token is a U+000A LINE FEED (LF) character token, then ignore
     1881                 * > that token and move on to the next one. (Newlines at the start of
     1882                 * > textarea elements are ignored as an authoring convenience.)
     1883                 *
     1884                 * This is handled in `get_modifiable_text()`.
     1885                 */
     1886
     1887                $this->state->frameset_ok = false;
     1888
     1889                /*
     1890                 * > Switch the insertion mode to "text".
     1891                 *
     1892                 * As a self-contained node, this behavior is handled in the Tag Processor.
     1893                 */
     1894                return true;
     1895
     1896            /*
     1897             * > A start tag whose tag name is "xmp"
     1898             */
     1899            case '+XMP':
     1900                if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
     1901                    $this->close_a_p_element();
     1902                }
     1903
     1904                $this->reconstruct_active_formatting_elements();
     1905                $this->state->frameset_ok = false;
     1906
     1907                /*
     1908                 * > Follow the generic raw text element parsing algorithm.
     1909                 *
     1910                 * As a self-contained node, this behavior is handled in the Tag Processor.
     1911                 */
     1912                $this->insert_html_element( $this->state->current_token );
     1913                return true;
     1914
     1915            /*
     1916             * > A start tag whose tag name is "iframe"
     1917             */
     1918            case '+IFRAME':
     1919                $this->state->frameset_ok = false;
     1920
     1921                /*
     1922                 * > Follow the generic raw text element parsing algorithm.
     1923                 *
     1924                 * As a self-contained node, this behavior is handled in the Tag Processor.
     1925                 */
     1926                $this->insert_html_element( $this->state->current_token );
     1927                return true;
     1928
     1929            /*
     1930             * > A start tag whose tag name is "noembed"
     1931             * > A start tag whose tag name is "noscript", if the scripting flag is enabled
     1932             *
     1933             * The scripting flag is never enabled in this parser.
     1934             */
     1935            case '+NOEMBED':
    15561936                $this->insert_html_element( $this->state->current_token );
    15571937                return true;
     
    15981978                $this->insert_html_element( $this->state->current_token );
    15991979                return true;
    1600         }
    1601 
    1602         /*
    1603          * These tags require special handling in the 'in body' insertion mode
    1604          * but that handling hasn't yet been implemented.
    1605          *
    1606          * As the rules for each tag are implemented, the corresponding tag
    1607          * name should be removed from this list. An accompanying test should
    1608          * help ensure this list is maintained.
    1609          *
    1610          * @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags
    1611          *
    1612          * Since this switch structure throws a WP_HTML_Unsupported_Exception, it's
    1613          * possible to handle "any other start tag" and "any other end tag" below,
    1614          * as that guarantees execution doesn't proceed for the unimplemented tags.
    1615          *
    1616          * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
    1617          */
    1618         switch ( $token_name ) {
    1619             case 'APPLET':
    1620             case 'BASE':
    1621             case 'BASEFONT':
    1622             case 'BGSOUND':
    1623             case 'BODY':
    1624             case 'CAPTION':
    1625             case 'COL':
    1626             case 'COLGROUP':
    1627             case 'FORM':
    1628             case 'FRAME':
    1629             case 'FRAMESET':
    1630             case 'HEAD':
    1631             case 'HTML':
    1632             case 'IFRAME':
    1633             case 'LINK':
    1634             case 'MARQUEE':
    1635             case 'MATH':
    1636             case 'META':
    1637             case 'NOBR':
    1638             case 'NOEMBED':
    1639             case 'NOFRAMES':
    1640             case 'NOSCRIPT':
    1641             case 'OBJECT':
    1642             case 'PLAINTEXT':
    1643             case 'RB':
    1644             case 'RP':
    1645             case 'RT':
    1646             case 'RTC':
    1647             case 'SARCASM':
    1648             case 'SCRIPT':
    1649             case 'STYLE':
    1650             case 'SVG':
    1651             case 'TABLE':
    1652             case 'TBODY':
    1653             case 'TD':
    1654             case 'TEMPLATE':
    1655             case 'TEXTAREA':
    1656             case 'TFOOT':
    1657             case 'TH':
    1658             case 'THEAD':
    1659             case 'TITLE':
    1660             case 'TR':
    1661             case 'XMP':
    1662                 $this->bail( "Cannot process {$token_name} element." );
     1980
     1981            /*
     1982             * > A start tag whose tag name is one of: "rb", "rtc"
     1983             */
     1984            case '+RB':
     1985            case '+RTC':
     1986                if ( $this->state->stack_of_open_elements->has_element_in_scope( 'RUBY' ) ) {
     1987                    $this->generate_implied_end_tags();
     1988
     1989                    if ( $this->state->stack_of_open_elements->current_node_is( 'RUBY' ) ) {
     1990                        // @todo Indicate a parse error once it's possible.
     1991                    }
     1992                }
     1993
     1994                $this->insert_html_element( $this->state->current_token );
     1995                return true;
     1996
     1997            /*
     1998             * > A start tag whose tag name is one of: "rp", "rt"
     1999             */
     2000            case '+RP':
     2001            case '+RT':
     2002                if ( $this->state->stack_of_open_elements->has_element_in_scope( 'RUBY' ) ) {
     2003                    $this->generate_implied_end_tags( 'RTC' );
     2004
     2005                    $current_node_name = $this->state->stack_of_open_elements->current_node()->node_name;
     2006                    if ( 'RTC' === $current_node_name || 'RUBY' === $current_node_name ) {
     2007                        // @todo Indicate a parse error once it's possible.
     2008                    }
     2009                }
     2010
     2011                $this->insert_html_element( $this->state->current_token );
     2012                return true;
     2013
     2014            /*
     2015             * > A start tag whose tag name is "math"
     2016             */
     2017            case '+MATH':
     2018                $this->reconstruct_active_formatting_elements();
     2019
     2020                /*
     2021                 * @todo Adjust MathML attributes for the token. (This fixes the case of MathML attributes that are not all lowercase.)
     2022                 * @todo Adjust foreign attributes for the token. (This fixes the use of namespaced attributes, in particular XLink.)
     2023                 *
     2024                 * These ought to be handled in the attribute methods.
     2025                 */
     2026
     2027                $this->bail( 'Cannot process MATH element, opening foreign content.' );
     2028                break;
     2029
     2030            /*
     2031             * > A start tag whose tag name is "svg"
     2032             */
     2033            case '+SVG':
     2034                $this->reconstruct_active_formatting_elements();
     2035
     2036                /*
     2037                 * @todo Adjust SVG attributes for the token. (This fixes the case of SVG attributes that are not all lowercase.)
     2038                 * @todo Adjust foreign attributes for the token. (This fixes the use of namespaced attributes, in particular XLink in SVG.)
     2039                 *
     2040                 * These ought to be handled in the attribute methods.
     2041                 */
     2042
     2043                $this->bail( 'Cannot process SVG element, opening foreign content.' );
     2044                break;
     2045
     2046            /*
     2047             * > A start tag whose tag name is one of: "caption", "col", "colgroup",
     2048             * > "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr"
     2049             */
     2050            case '+CAPTION':
     2051            case '+COL':
     2052            case '+COLGROUP':
     2053            case '+FRAME':
     2054            case '+HEAD':
     2055            case '+TBODY':
     2056            case '+TD':
     2057            case '+TFOOT':
     2058            case '+TH':
     2059            case '+THEAD':
     2060            case '+TR':
     2061                // Parse error. Ignore the token.
     2062                return $this->step();
    16632063        }
    16642064
     
    16822082             */
    16832083            foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
     2084                /*
     2085                 * @todo This needs to check if the element in scope is an HTML element, meaning that
     2086                 *       when SVG and MathML support is added, this needs to differentiate between an
     2087                 *       HTML element of the given name, such as `<object>`, and a foreign element of
     2088                 *       the same given name.
     2089                 */
    16842090                if ( $token_name === $node->node_name ) {
    16852091                    break;
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r58769 r58779  
    130130 *     true === $processor->next_tag( 'DIV' );
    131131 *
    132  * #### Special elements
     132 * #### Special self-contained elements
    133133 *
    134134 * Some HTML elements are handled in a special way; their start and end tags
     
    757757
    758758    /**
     759     * Whether the parser should skip over an immediately-following linefeed
     760     * character, as is the case with LISTING, PRE, and TEXTAREA.
     761     *
     762     * > If the next token is a U+000A LINE FEED (LF) character token, then
     763     * > ignore that token and move on to the next one. (Newlines at the start
     764     * > of [these] elements are ignored as an authoring convenience.)
     765     *
     766     * @since 6.7.0
     767     *
     768     * @var int|null
     769     */
     770    private $skip_newline_at = null;
     771
     772    /**
    759773     * Constructor.
    760774     *
     
    927941
    928942        /*
    929          * For non-DATA sections which might contain text that looks like HTML tags but
    930          * isn't, scan with the appropriate alternative mode. Looking at the first letter
    931          * of the tag name as a pre-check avoids a string allocation when it's not needed.
    932          */
    933         $t = $this->html[ $this->tag_name_starts_at ];
     943         * Certain tags require additional processing. The first-letter pre-check
     944         * avoids unnecessary string allocation when comparing the tag names.
     945         *
     946         *  - IFRAME
     947         *  - LISTING (deprecated)
     948         *  - NOEMBED (deprecated)
     949         *  - NOFRAMES (deprecated)
     950         *  - PRE
     951         *  - SCRIPT
     952         *  - STYLE
     953         *  - TEXTAREA
     954         *  - TITLE
     955         *  - XMP (deprecated)
     956         */
    934957        if (
    935958            $this->is_closing_tag ||
    936             ! (
    937                 'i' === $t || 'I' === $t ||
    938                 'n' === $t || 'N' === $t ||
    939                 's' === $t || 'S' === $t ||
    940                 't' === $t || 'T' === $t ||
    941                 'x' === $t || 'X' === $t
    942             )
     959            1 !== strspn( $this->html, 'iIlLnNpPsStTxX', $this->tag_name_starts_at, 1 )
    943960        ) {
    944961            return true;
     
    948965
    949966        /*
     967         * For LISTING, PRE, and TEXTAREA, the first linefeed of an immediately-following
     968         * text node is ignored as an authoring convenience.
     969         *
     970         * @see static::skip_newline_at
     971         */
     972        if ( 'LISTING' === $tag_name || 'PRE' === $tag_name ) {
     973            $this->skip_newline_at = $this->bytes_already_parsed;
     974            return true;
     975        }
     976
     977        /*
     978         * There are certain elements whose children are not DATA but are instead
     979         * RCDATA or RAWTEXT. These cannot contain other elements, and the contents
     980         * are parsed as plaintext, with character references decoded in RCDATA but
     981         * not in RAWTEXT.
     982         *
     983         * These elements are described here as "self-contained" or special atomic
     984         * elements whose end tag is consumed with the opening tag, and they will
     985         * contain modifiable text inside of them.
     986         *
    950987         * Preserve the opening tag pointers, as these will be overwritten
    951988         * when finding the closing tag. They will be reset after finding
     
    26912728     *
    26922729     * @since 6.2.0
     2730     * @since 6.7.0 Reports all BR tags as opening tags.
    26932731     *
    26942732     * @return bool Whether the current tag is a tag closer.
     
    26972735        return (
    26982736            self::STATE_MATCHED_TAG === $this->parser_state &&
    2699             $this->is_closing_tag
     2737            $this->is_closing_tag &&
     2738
     2739            /*
     2740             * The BR tag can only exist as an opening tag. If something like `</br>`
     2741             * appears then the HTML parser will treat it as an opening tag with no
     2742             * attributes. The BR tag is unique in this way.
     2743             *
     2744             * @see https://html.spec.whatwg.org/#parsing-main-inbody
     2745             */
     2746            'BR' !== $this->get_tag()
    27002747        );
    27012748    }
     
    28262873     * have an empty string (e.g. a comment with no contents).
    28272874     *
     2875     * Limitations:
     2876     *
     2877     *  - This function will not strip the leading newline appropriately
     2878     *    after seeking into a LISTING or PRE element. To ensure that the
     2879     *    newline is treated properly, seek to the LISTING or PRE opening
     2880     *    tag instead of to the first text node inside the element.
     2881     *
    28282882     * @since 6.5.0
     2883     * @since 6.7.0 Replaces NULL bytes (U+0000) and newlines appropriately.
    28292884     *
    28302885     * @return string
    28312886     */
    28322887    public function get_modifiable_text(): string {
    2833         if ( null === $this->text_starts_at ) {
     2888        if ( null === $this->text_starts_at || 0 === $this->text_length ) {
    28342889            return '';
    28352890        }
    28362891
    28372892        $text = substr( $this->html, $this->text_starts_at, $this->text_length );
     2893
     2894        /*
     2895         * Pre-processing the input stream would normally happen before
     2896         * any parsing is done, but deferring it means it's possible to
     2897         * skip in most cases. When getting the modifiable text, however,
     2898         * it's important to apply the pre-processing step, which is
     2899         * normalizing newlines.
     2900         *
     2901         * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
     2902         * @see https://infra.spec.whatwg.org/#normalize-newlines
     2903         */
     2904        $text = str_replace( "\r\n", "\n", $text );
     2905        $text = str_replace( "\r", "\n", $text );
    28382906
    28392907        // Comment data is not decoded.
     
    28442912            self::STATE_FUNKY_COMMENT === $this->parser_state
    28452913        ) {
    2846             return $text;
    2847         }
    2848 
    2849         $tag_name = $this->get_tag();
     2914            return str_replace( "\x00", "\u{FFFD}", $text );
     2915        }
     2916
     2917        $tag_name = $this->get_token_name();
    28502918        if (
    28512919            // Script data is not decoded.
     
    28592927            'XMP' === $tag_name
    28602928        ) {
    2861             return $text;
     2929            return str_replace( "\x00", "\u{FFFD}", $text );
    28622930        }
    28632931
     
    28652933
    28662934        /*
    2867          * TEXTAREA skips a leading newline, but this newline may appear not only as the
    2868          * literal character `\n`, but also as a character reference, such as in the
    2869          * following markup: `<textarea>&#x0a;Content</textarea>`.
    2870          *
    2871          * For these cases it's important to first decode the text content before checking
    2872          * for a leading newline and removing it.
     2935         * Skip the first line feed after LISTING, PRE, and TEXTAREA opening tags.
     2936         *
     2937         * Note that this first newline may come in the form of a character
     2938         * reference, such as `&#x0a;`, and so it's important to perform
     2939         * this transformation only after decoding the raw text content.
    28732940         */
    28742941        if (
    2875             self::STATE_MATCHED_TAG === $this->parser_state &&
    2876             'TEXTAREA' === $tag_name &&
    2877             strlen( $decoded ) > 0 &&
    2878             "\n" === $decoded[0]
     2942            ( "\n" === ( $decoded[0] ?? '' ) ) &&
     2943            ( ( $this->skip_newline_at === $this->token_starts_at && '#text' === $tag_name ) || 'TEXTAREA' === $tag_name )
    28792944        ) {
    2880             return substr( $decoded, 1 );
    2881         }
    2882 
    2883         return $decoded;
     2945            $decoded = substr( $decoded, 1 );
     2946        }
     2947
     2948        /*
     2949         * Only in normative text nodes does the NULL byte (U+0000) get removed.
     2950         * In all other contexts it's replaced by the replacement character (U+FFFD)
     2951         * for security reasons (to avoid joining together strings that were safe
     2952         * when separated, but not when joined).
     2953         */
     2954        return '#text' === $tag_name
     2955            ? str_replace( "\x00", '', $decoded )
     2956            : str_replace( "\x00", "\u{FFFD}", $decoded );
    28842957    }
    28852958
  • trunk/src/wp-includes/html-api/class-wp-html-token.php

    r58769 r58779  
    7373     * @since 6.4.0
    7474     *
    75      * @param string        $bookmark_name         Name of bookmark corresponding to location in HTML where token is found.
     75     * @param string|null   $bookmark_name         Name of bookmark corresponding to location in HTML where token is found,
     76     *                                             or `null` for markers and nodes without a bookmark.
    7677     * @param string        $node_name             Name of node token represents; if uppercase, an HTML element; if lowercase, a special value like "marker".
    7778     * @param bool          $has_self_closing_flag Whether the source token contains the self-closing flag, regardless of whether it's valid.
    7879     * @param callable|null $on_destroy            Optional. Function to call when destroying token, useful for releasing the bookmark.
    7980     */
    80     public function __construct( string $bookmark_name, string $node_name, bool $has_self_closing_flag, ?callable $on_destroy = null ) {
     81    public function __construct( ?string $bookmark_name, string $node_name, bool $has_self_closing_flag, ?callable $on_destroy = null ) {
    8182        $this->bookmark_name         = $bookmark_name;
    8283        $this->node_name             = $node_name;
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessor.php

    r58677 r58779  
    135135     * @covers WP_HTML_Processor::is_void
    136136     *
    137      * @dataProvider data_void_tags
     137     * @dataProvider data_void_tags_not_ignored_in_body
    138138     *
    139139     * @param string $tag_name Name of void tag under test.
     
    251251        );
    252252
    253         foreach ( self::data_void_tags() as $tag_name => $_name ) {
     253        foreach ( self::data_void_tags_not_ignored_in_body() as $tag_name => $_name ) {
    254254            $self_contained_nodes[ "Void elements ({$tag_name})" ] = array( "<{$tag_name}>" );
    255255        }
     
    285285     * @ticket 60382
    286286     *
    287      * @dataProvider data_void_tags
     287     * @dataProvider data_void_tags_not_ignored_in_body
    288288     *
    289289     * @param string $tag_name Name of void tag under test.
     
    319319            $processor->get_breadcrumbs(),
    320320            'Found incorrect nesting of first element.'
    321         );
    322 
    323         $this->assertTrue(
    324             $processor->next_token(),
    325             'Should have found the DIV as the second tag.'
    326         );
    327 
    328         $this->assertSame(
    329             array( 'HTML', 'BODY', 'DIV' ),
    330             $processor->get_breadcrumbs(),
    331             "DIV should have been a sibling of the {$tag_name}."
    332321        );
    333322    }
     
    359348
    360349    /**
     350     * Data provider.
     351     *
     352     * @return array[]
     353     */
     354    public static function data_void_tags_not_ignored_in_body() {
     355        $all_void_tags = self::data_void_tags();
     356        unset( $all_void_tags['COL'] );
     357
     358        return $all_void_tags;
     359    }
     360
     361    /**
    361362     * Ensures that special handling of unsupported tags is cleaned up
    362363     * as handling is implemented. Otherwise there's risk of leaving special
     
    384385    public static function data_unsupported_special_in_body_tags() {
    385386        return array(
    386             'APPLET'    => array( 'APPLET' ),
    387             'BASE'      => array( 'BASE' ),
    388             'BASEFONT'  => array( 'BASEFONT' ),
    389             'BGSOUND'   => array( 'BGSOUND' ),
    390             'BODY'      => array( 'BODY' ),
    391             'CAPTION'   => array( 'CAPTION' ),
    392             'COL'       => array( 'COL' ),
    393             'COLGROUP'  => array( 'COLGROUP' ),
    394             'FORM'      => array( 'FORM' ),
    395             'FRAME'     => array( 'FRAME' ),
    396             'FRAMESET'  => array( 'FRAMESET' ),
    397             'HEAD'      => array( 'HEAD' ),
    398             'HTML'      => array( 'HTML' ),
    399             'IFRAME'    => array( 'IFRAME' ),
    400             'LINK'      => array( 'LINK' ),
    401             'MARQUEE'   => array( 'MARQUEE' ),
    402             'MATH'      => array( 'MATH' ),
    403             'META'      => array( 'META' ),
    404             'NOBR'      => array( 'NOBR' ),
    405             'NOEMBED'   => array( 'NOEMBED' ),
    406             'NOFRAMES'  => array( 'NOFRAMES' ),
    407             'NOSCRIPT'  => array( 'NOSCRIPT' ),
    408             'OBJECT'    => array( 'OBJECT' ),
    409             'PLAINTEXT' => array( 'PLAINTEXT' ),
    410             'RB'        => array( 'RB' ),
    411             'RP'        => array( 'RP' ),
    412             'RT'        => array( 'RT' ),
    413             'RTC'       => array( 'RTC' ),
    414             'SARCASM'   => array( 'SARCASM' ),
    415             'SCRIPT'    => array( 'SCRIPT' ),
    416             'STYLE'     => array( 'STYLE' ),
    417             'SVG'       => array( 'SVG' ),
    418             'TABLE'     => array( 'TABLE' ),
    419             'TBODY'     => array( 'TBODY' ),
    420             'TD'        => array( 'TD' ),
    421             'TEMPLATE'  => array( 'TEMPLATE' ),
    422             'TEXTAREA'  => array( 'TEXTAREA' ),
    423             'TFOOT'     => array( 'TFOOT' ),
    424             'TH'        => array( 'TH' ),
    425             'THEAD'     => array( 'THEAD' ),
    426             'TITLE'     => array( 'TITLE' ),
    427             'TR'        => array( 'TR' ),
    428             'XMP'       => array( 'XMP' ),
     387            'MATH' => array( 'MATH' ),
     388            'SVG'  => array( 'SVG' ),
    429389        );
    430390    }
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php

    r58741 r58779  
    4141            'ACRONYM', // Neutralized.
    4242            'ADDRESS',
     43            'APPLET', // Deprecated.
    4344            'AREA',
    4445            'ARTICLE',
     
    7374            'FIGURE',
    7475            'FONT',
     76            'FORM',
    7577            'FOOTER',
    7678            'H1',
     
    9698            'MAP',
    9799            'MARK',
     100            'MARQUEE', // Deprecated.
    98101            'MENU',
    99102            'METER',
     
    101104            'NAV',
    102105            'NEXTID', // Deprecated.
     106            'NOBR', // Neutralized.
     107            'NOSCRIPT',
     108            'OBJECT',
    103109            'OL',
    104110            'OUTPUT',
     
    107113            'PROGRESS',
    108114            'Q',
     115            'RB', // Neutralized.
     116            'RP',
     117            'RT',
     118            'RTC', // Neutralized.
    109119            'RUBY',
    110120            'SAMP',
     
    120130            'SUMMARY',
    121131            'SUP',
     132            'TABLE',
    122133            'TIME',
    123134            'TT',
     
    168179    public static function data_unsupported_elements() {
    169180        $unsupported_elements = array(
    170             'APPLET', // Deprecated.
    171181            'BASE',
    172182            'BGSOUND', // Deprecated; self-closing if self-closing flag provided, otherwise normal.
     
    175185            'COL',
    176186            'COLGROUP',
    177             'FORM',
    178187            'FRAME',
    179188            'FRAMESET',
     
    182191            'IFRAME',
    183192            'LINK',
    184             'MARQUEE', // Deprecated.
    185193            'MATH',
    186194            'META',
    187             'NOBR', // Neutralized.
    188195            'NOEMBED', // Neutralized.
    189196            'NOFRAMES', // Neutralized.
    190             'NOSCRIPT',
    191             'OBJECT',
    192197            'PLAINTEXT', // Neutralized.
    193             'RB', // Neutralized.
    194             'RP',
    195             'RT',
    196             'RTC', // Neutralized.
    197198            'SCRIPT',
    198199            'STYLE',
    199200            'SVG',
    200             'TABLE',
    201201            'TBODY',
    202202            'TD',
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php

    r58742 r58779  
    3232     */
    3333    const SKIP_TESTS = array(
    34         'adoption01/line0046'       => 'Unimplemented: Reconstruction of active formatting elements.',
    35         'adoption01/line0159'       => 'Unimplemented: Reconstruction of active formatting elements.',
    36         'adoption01/line0318'       => 'Unimplemented: Reconstruction of active formatting elements.',
    37         'inbody01/line0001'         => 'Bug.',
    38         'inbody01/line0014'         => 'Bug.',
    39         'inbody01/line0029'         => 'Bug.',
    40         'menuitem-element/line0012' => 'Bug.',
    41         'tests1/line0342'           => "Closing P tag implicitly creates opener, which we don't visit.",
    42         'tests1/line0720'           => 'Unimplemented: Reconstruction of active formatting elements.',
    43         'tests15/line0001'          => 'Unimplemented: Reconstruction of active formatting elements.',
    44         'tests15/line0022'          => 'Unimplemented: Reconstruction of active formatting elements.',
    45         'tests2/line0650'           => 'Whitespace only test never enters "in body" parsing mode.',
    46         'tests20/line0497'          => "Closing P tag implicitly creates opener, which we don't visit.",
    47         'tests23/line0001'          => 'Unimplemented: Reconstruction of active formatting elements.',
    48         'tests23/line0041'          => 'Unimplemented: Reconstruction of active formatting elements.',
    49         'tests23/line0069'          => 'Unimplemented: Reconstruction of active formatting elements.',
    50         'tests23/line0101'          => 'Unimplemented: Reconstruction of active formatting elements.',
    51         'tests25/line0169'          => 'Bug.',
    52         'tests26/line0263'          => 'Bug: An active formatting element should be created for a trailing text node.',
     34        'adoption01/line0046' => 'Unimplemented: Reconstruction of active formatting elements.',
     35        'adoption01/line0159' => 'Unimplemented: Reconstruction of active formatting elements.',
     36        'adoption01/line0318' => 'Unimplemented: Reconstruction of active formatting elements.',
     37        'tests1/line0720'     => 'Unimplemented: Reconstruction of active formatting elements.',
     38        'tests15/line0001'    => 'Unimplemented: Reconstruction of active formatting elements.',
     39        'tests15/line0022'    => 'Unimplemented: Reconstruction of active formatting elements.',
     40        'tests15/line0068'    => 'Unimplemented: no support outside of IN BODY yet.',
     41        'tests2/line0650'     => 'Whitespace only test never enters "in body" parsing mode.',
     42        'tests19/line0965'    => 'Unimplemented: no support outside of IN BODY yet.',
     43        'tests23/line0001'    => 'Unimplemented: Reconstruction of active formatting elements.',
     44        'tests23/line0041'    => 'Unimplemented: Reconstruction of active formatting elements.',
     45        'tests23/line0069'    => 'Unimplemented: Reconstruction of active formatting elements.',
     46        'tests23/line0101'    => 'Unimplemented: Reconstruction of active formatting elements.',
     47        'tests26/line0263'    => 'Bug: An active formatting element should be created for a trailing text node.',
     48        'webkit01/line0231'   => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
     49        'webkit02/line0013'   => "Asserting behavior with scripting flag enabled, which this parser doesn't support.",
     50        'webkit01/line0300'   => 'Unimplemented: no support outside of IN BODY yet.',
     51        'webkit01/line0310'   => 'Unimplemented: no support outside of IN BODY yet.',
     52        'webkit01/line0336'   => 'Unimplemented: no support outside of IN BODY yet.',
     53        'webkit01/line0349'   => 'Unimplemented: no support outside of IN BODY yet.',
     54        'webkit01/line0362'   => 'Unimplemented: no support outside of IN BODY yet.',
     55        'webkit01/line0375'   => 'Unimplemented: no support outside of IN BODY yet.',
    5356    );
    5457
     
    199202                            $output .= str_repeat( $indent, $tag_indent + 1 ) . "{$attribute_name}=\"{$val}\"\n";
    200203                        }
    201 
    202                         // Self-contained tags contain their inner contents as modifiable text.
    203                         $modifiable_text = $processor->get_modifiable_text();
    204                         if ( '' !== $modifiable_text ) {
    205                             $was_text = true;
    206                             if ( '' === $text_node ) {
    207                                 $text_node = str_repeat( $indent, $indent_level ) . '"';
    208                             }
    209                             $text_node .= $modifiable_text;
    210                             --$indent_level;
    211                         }
     204                    }
     205
     206                    // Self-contained tags contain their inner contents as modifiable text.
     207                    $modifiable_text = $processor->get_modifiable_text();
     208                    if ( '' !== $modifiable_text ) {
     209                        $output .= str_repeat( $indent, $indent_level ) . "\"{$modifiable_text}\"\n";
     210                    }
     211
     212                    if ( ! $processor->is_void( $tag_name ) && ! $processor->expects_closer() ) {
     213                        --$indent_level;
    212214                    }
    213215
     
    226228                        case WP_HTML_Processor::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT:
    227229                        case WP_HTML_Processor::COMMENT_AS_HTML_COMMENT:
     230                        case WP_HTML_Processor::COMMENT_AS_INVALID_HTML:
    228231                            $comment_text_content = $processor->get_modifiable_text();
    229232                            break;
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php

    r58713 r58779  
    407407
    408408    /**
    409      * Ensures that support isn't accidentally partially added for the closing BR tag `</br>`.
    410      *
    411      * This tag closer has special rules and support shouldn't be added without implementing full support.
     409     * Ensures that closing `</br>` tags are appropriately treated as opening tags with no attributes.
    412410     *
    413411     * > An end tag whose tag name is "br"
     
    416414     * >   tag token that it actually is.
    417415     *
    418      * When this handling is implemented, this test should be removed. It's not incorporated
    419      * into the existing unsupported tag behavior test because the opening tag is supported;
    420      * only the closing tag isn't.
    421      *
    422416     * @covers WP_HTML_Processor::step_in_body
    423417     *
     
    425419     */
    426420    public function test_br_end_tag_unsupported() {
    427         $processor = WP_HTML_Processor::create_fragment( '</br>' );
    428 
    429         $this->assertFalse( $processor->next_tag(), 'Found a BR tag that should not be handled.' );
    430         $this->assertSame( WP_HTML_Processor::ERROR_UNSUPPORTED, $processor->get_last_error() );
     421        $processor = WP_HTML_Processor::create_fragment( '</br id="an-opener" html>' );
     422
     423        $this->assertTrue( $processor->next_tag(), 'Failed to find the expected opening BR tag.' );
     424        $this->assertFalse( $processor->is_tag_closer(), 'Should have treated the tag as an opening tag.' );
     425        $this->assertNull( $processor->get_attribute_names_with_prefix( '' ), 'Should have ignored any attributes on the tag.' );
    431426    }
    432427}
  • trunk/tests/phpunit/tests/html-api/wpHtmlSupportRequiredHtmlProcessor.php

    r58677 r58779  
    1 <?php
    2 /**
    3  * Unit tests for the HTML API indicating that changes are needed to the
    4  * WP_HTML_Processor class before specific features are added to the API.
    5  *
    6  * Note! Duplication of test cases and the helper function in this file are intentional.
    7  * This test file exists to warn developers of related areas of code that need to update
    8  * together when adding support for new elements to the HTML Processor. For example,
    9  * when adding support for the LI element it's necessary to update the function which
    10  * generates implied end tags. This is because each element might bring with it semantic
    11  * rules that impact the way the document should be parsed.
    12  *
    13  * Without these tests a developer needs to investigate all possible places they
    14  * might need to update when adding support for more elements and risks overlooking
    15  * important parts that, in the absence of the related support, will lead to errors.
    16  *
    17  * @package WordPress
    18  * @subpackage HTML-API
    19  *
    20  * @since 6.4.0
    21  *
    22  * @group html-api
    23  *
    24  * @coversDefaultClass WP_HTML_Processor
    25  */
    26 class Tests_HtmlApi_WpHtmlSupportRequiredHtmlProcessor extends WP_UnitTestCase {
    27     /**
    28      * Fails the test if the HTML Processor handles the given tag.
    29      *
    30      * This test helper is used throughout this test file for one purpose only: to
    31      * fail a test if the HTML Processor handles the given tag. In other words, it
    32      * ensures that the HTML Processor aborts when encountering the given tag.
    33      *
    34      * This is used to ensure that when support for a new tag is added to the
    35      * HTML Processor it receives full support and not partial support, which
    36      * could lead to a variety of issues.
    37      *
    38      * Do not remove this helper function as it provides semantic meaning to the
    39      * assertions in the tests in this file and its behavior is incredibly specific
    40      * and limited and doesn't warrant adding a new abstraction into WP_UnitTestCase.
    41      *
    42      * @param string $tag_name Tag name at which the HTML Processor should abort, e.g. "BUTTON".
    43      */
    44     private function ensure_support_is_added_everywhere( $tag_name ) {
    45         $processor = WP_HTML_Processor::create_fragment( "<$tag_name>" );
    46 
    47         $this->assertFalse( $processor->step(), "Must support terminating elements in specific scope check before adding support for the {$tag_name} element." );
    48     }
    49 
    50     /**
    51      * Generating implied end tags walks up the stack of open elements
    52      * as long as any of the following missing elements is the current node.
    53      *
    54      * @since 6.4.0
    55      *
    56      * @ticket 58907
    57      *
    58      * @covers WP_HTML_Processor::generate_implied_end_tags
    59      */
    60     public function test_generate_implied_end_tags_needs_support() {
    61         $this->ensure_support_is_added_everywhere( 'RB' );
    62         $this->ensure_support_is_added_everywhere( 'RP' );
    63         $this->ensure_support_is_added_everywhere( 'RT' );
    64         $this->ensure_support_is_added_everywhere( 'RTC' );
    65     }
    66 
    67     /**
    68      * Generating implied end tags thoroughly walks up the stack of open elements
    69      * as long as any of the following missing elements is the current node.
    70      *
    71      * @since 6.4.0
    72      *
    73      * @ticket 58907
    74      *
    75      * @covers WP_HTML_Processor::generate_implied_end_tags_thoroughly
    76      */
    77     public function test_generate_implied_end_tags_thoroughly_needs_support() {
    78         $this->ensure_support_is_added_everywhere( 'CAPTION' );
    79         $this->ensure_support_is_added_everywhere( 'COLGROUP' );
    80         $this->ensure_support_is_added_everywhere( 'RB' );
    81         $this->ensure_support_is_added_everywhere( 'RP' );
    82         $this->ensure_support_is_added_everywhere( 'RT' );
    83         $this->ensure_support_is_added_everywhere( 'RTC' );
    84         $this->ensure_support_is_added_everywhere( 'TBODY' );
    85         $this->ensure_support_is_added_everywhere( 'TD' );
    86         $this->ensure_support_is_added_everywhere( 'TFOOT' );
    87         $this->ensure_support_is_added_everywhere( 'TH' );
    88         $this->ensure_support_is_added_everywhere( 'THEAD' );
    89         $this->ensure_support_is_added_everywhere( 'TR' );
    90     }
    91 }
  • trunk/tests/phpunit/tests/html-api/wpHtmlSupportRequiredOpenElements.php

    r58677 r58779  
    6262     */
    6363    public function test_has_element_in_scope_needs_support() {
    64         // These elements impact all scopes.
    65         $this->ensure_support_is_added_everywhere( 'APPLET' );
    66         $this->ensure_support_is_added_everywhere( 'CAPTION' );
    67         $this->ensure_support_is_added_everywhere( 'HTML' );
    68         $this->ensure_support_is_added_everywhere( 'TABLE' );
    69         $this->ensure_support_is_added_everywhere( 'TD' );
    70         $this->ensure_support_is_added_everywhere( 'TH' );
    71         $this->ensure_support_is_added_everywhere( 'MARQUEE' );
    72         $this->ensure_support_is_added_everywhere( 'OBJECT' );
    73         $this->ensure_support_is_added_everywhere( 'TEMPLATE' );
    74 
    7564        // MathML Elements: MI, MO, MN, MS, MTEXT, ANNOTATION-XML.
    7665        $this->ensure_support_is_added_everywhere( 'MATH' );
     
    10089     */
    10190    public function test_has_element_in_list_item_scope_needs_support() {
    102         // These elements impact all scopes.
    103         $this->ensure_support_is_added_everywhere( 'APPLET' );
    104         $this->ensure_support_is_added_everywhere( 'CAPTION' );
    105         $this->ensure_support_is_added_everywhere( 'HTML' );
    106         $this->ensure_support_is_added_everywhere( 'TABLE' );
    107         $this->ensure_support_is_added_everywhere( 'TD' );
    108         $this->ensure_support_is_added_everywhere( 'TH' );
    109         $this->ensure_support_is_added_everywhere( 'MARQUEE' );
    110         $this->ensure_support_is_added_everywhere( 'OBJECT' );
    111         $this->ensure_support_is_added_everywhere( 'TEMPLATE' );
    112 
    11391        // MathML Elements: MI, MO, MN, MS, MTEXT, ANNOTATION-XML.
    11492        $this->ensure_support_is_added_everywhere( 'MATH' );
     
    134112     */
    135113    public function test_has_element_in_button_scope_needs_support() {
    136         // These elements impact all scopes.
    137         $this->ensure_support_is_added_everywhere( 'APPLET' );
    138         $this->ensure_support_is_added_everywhere( 'CAPTION' );
    139         $this->ensure_support_is_added_everywhere( 'HTML' );
    140         $this->ensure_support_is_added_everywhere( 'TABLE' );
    141         $this->ensure_support_is_added_everywhere( 'TD' );
    142         $this->ensure_support_is_added_everywhere( 'TH' );
    143         $this->ensure_support_is_added_everywhere( 'MARQUEE' );
    144         $this->ensure_support_is_added_everywhere( 'OBJECT' );
    145         $this->ensure_support_is_added_everywhere( 'TEMPLATE' );
    146 
    147114        // MathML Elements: MI, MO, MN, MS, MTEXT, ANNOTATION-XML.
    148115        $this->ensure_support_is_added_everywhere( 'MATH' );
     
    169136     */
    170137    public function test_after_element_pop_must_maintain_p_in_button_scope_flag() {
    171         // These elements impact all scopes.
    172         $this->ensure_support_is_added_everywhere( 'APPLET' );
    173         $this->ensure_support_is_added_everywhere( 'CAPTION' );
    174         $this->ensure_support_is_added_everywhere( 'HTML' );
    175         $this->ensure_support_is_added_everywhere( 'TABLE' );
    176         $this->ensure_support_is_added_everywhere( 'TD' );
    177         $this->ensure_support_is_added_everywhere( 'TH' );
    178         $this->ensure_support_is_added_everywhere( 'MARQUEE' );
    179         $this->ensure_support_is_added_everywhere( 'OBJECT' );
    180         $this->ensure_support_is_added_everywhere( 'TEMPLATE' );
    181 
    182138        // MathML Elements: MI, MO, MN, MS, MTEXT, ANNOTATION-XML.
    183139        $this->ensure_support_is_added_everywhere( 'MATH' );
     
    204160     */
    205161    public function test_after_element_push_must_maintain_p_in_button_scope_flag() {
    206         // These elements impact all scopes.
    207         $this->ensure_support_is_added_everywhere( 'APPLET' );
    208         $this->ensure_support_is_added_everywhere( 'CAPTION' );
    209         $this->ensure_support_is_added_everywhere( 'HTML' );
    210         $this->ensure_support_is_added_everywhere( 'TABLE' );
    211         $this->ensure_support_is_added_everywhere( 'TD' );
    212         $this->ensure_support_is_added_everywhere( 'TH' );
    213         $this->ensure_support_is_added_everywhere( 'MARQUEE' );
    214         $this->ensure_support_is_added_everywhere( 'OBJECT' );
    215         $this->ensure_support_is_added_everywhere( 'TEMPLATE' );
    216 
    217162        // MathML Elements: MI, MO, MN, MS, MTEXT, ANNOTATION-XML.
    218163        $this->ensure_support_is_added_everywhere( 'MATH' );
     
    238183     */
    239184    public function test_has_element_in_table_scope_needs_support() {
    240         // These elements impact all scopes.
    241         $this->ensure_support_is_added_everywhere( 'APPLET' );
    242         $this->ensure_support_is_added_everywhere( 'CAPTION' );
    243         $this->ensure_support_is_added_everywhere( 'HTML' );
    244         $this->ensure_support_is_added_everywhere( 'TABLE' );
    245         $this->ensure_support_is_added_everywhere( 'TD' );
    246         $this->ensure_support_is_added_everywhere( 'TH' );
    247         $this->ensure_support_is_added_everywhere( 'MARQUEE' );
    248         $this->ensure_support_is_added_everywhere( 'OBJECT' );
    249         $this->ensure_support_is_added_everywhere( 'TEMPLATE' );
    250 
    251         // MathML Elements: MI, MO, MN, MS, MTEXT, ANNOTATION-XML.
    252         $this->ensure_support_is_added_everywhere( 'MATH' );
    253 
    254         /*
    255          * SVG elements: note that TITLE is both an HTML element and an SVG element
    256          * so care must be taken when adding support for either one.
    257          *
    258          * FOREIGNOBJECT, DESC, TITLE.
    259          */
    260         $this->ensure_support_is_added_everywhere( 'SVG' );
    261 
    262         // These elements are specific to TABLE scope.
    263         $this->ensure_support_is_added_everywhere( 'HTML' );
    264         $this->ensure_support_is_added_everywhere( 'TABLE' );
    265         $this->ensure_support_is_added_everywhere( 'TEMPLATE' );
    266 
    267         // These elements depend on table scope.
    268         $this->ensure_support_is_added_everywhere( 'CAPTION' );
    269         $this->ensure_support_is_added_everywhere( 'COL' );
    270         $this->ensure_support_is_added_everywhere( 'COLGROUP' );
    271         $this->ensure_support_is_added_everywhere( 'TBODY' );
    272         $this->ensure_support_is_added_everywhere( 'TD' );
    273         $this->ensure_support_is_added_everywhere( 'TFOOT' );
    274         $this->ensure_support_is_added_everywhere( 'TH' );
    275         $this->ensure_support_is_added_everywhere( 'THEAD' );
    276         $this->ensure_support_is_added_everywhere( 'TR' );
     185        // MathML Elements: MI, MO, MN, MS, MTEXT, ANNOTATION-XML.
     186        $this->ensure_support_is_added_everywhere( 'MATH' );
     187
     188        /*
     189         * SVG elements: note that TITLE is both an HTML element and an SVG element
     190         * so care must be taken when adding support for either one.
     191         *
     192         * FOREIGNOBJECT, DESC, TITLE.
     193         */
     194        $this->ensure_support_is_added_everywhere( 'SVG' );
    277195    }
    278196
     
    288206     */
    289207    public function test_has_element_in_select_scope_needs_support() {
    290         // These elements impact all scopes.
    291         $this->ensure_support_is_added_everywhere( 'APPLET' );
    292         $this->ensure_support_is_added_everywhere( 'CAPTION' );
    293         $this->ensure_support_is_added_everywhere( 'HTML' );
    294         $this->ensure_support_is_added_everywhere( 'TABLE' );
    295         $this->ensure_support_is_added_everywhere( 'TD' );
    296         $this->ensure_support_is_added_everywhere( 'TH' );
    297         $this->ensure_support_is_added_everywhere( 'MARQUEE' );
    298         $this->ensure_support_is_added_everywhere( 'OBJECT' );
    299         $this->ensure_support_is_added_everywhere( 'TEMPLATE' );
    300 
    301208        // MathML Elements: MI, MO, MN, MS, MTEXT, ANNOTATION-XML.
    302209        $this->ensure_support_is_added_everywhere( 'MATH' );
  • trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php

    r58040 r58779  
    5858
    5959    /**
     60     * Ensures that `get_modifiable_text()` properly transforms text content.
     61     *
     62     * The newline and NULL byte (U+0000) behaviors can be complicated since they depend
     63     * on where the bytes were found and whether they were raw bytes in the input stream
     64     * or decoded from character references.
     65     *
     66     * @ticket 61576
     67     *
     68     * @dataProvider data_modifiable_text_needing_transformation
     69     *
     70     * @param string $html_with_target_node    HTML with node containing a boolean `target` attribute, or `target=N` to select the token N steps after it.
     71     * @param string $expected_modifiable_text Expected modifiable text from target node or following node.
     72     */
     73    public function test_modifiable_text_proper_transforms( string $html_with_target_node, string $expected_modifiable_text ) {
     74        $processor = new WP_HTML_Tag_Processor( $html_with_target_node );
     75
     76        // Find the expected target node.
     77        while ( $processor->next_token() ) {
     78            $target = $processor->get_attribute( 'target' );
     79            if ( true === $target ) { // Boolean `target`: this token itself is the node under test.
     80                break;
     81            }
     82
     83            if ( is_numeric( $target ) ) { // Numeric `target=N`: the node under test is N tokens further on.
     84                for ( $i = (int) $target; $i > 0; $i-- ) {
     85                    $processor->next_token();
     86                }
     87                break;
     88            }
     89        }
     90
     91        $this->assertSame(
     92            $expected_modifiable_text,
     93            $processor->get_modifiable_text(),
     94            "Should have properly decoded and transformed modifiable text, but didn't."
     95        );
     96    }
     97
     98    /**
     99     * Data provider.
     100     *
     101     * @return array[] Datasets of HTML with a target node and the expected modifiable text.
     102     */
     103    public static function data_modifiable_text_needing_transformation() {
     104        return array( // Dataset name => array( HTML with target node, expected modifiable text ).
     105            'Text node + NULL byte'      => array( "<span target=1>NULL byte in \x00 text nodes disappears.", 'NULL byte in  text nodes disappears.' ),
     106            'LISTING + newline'          => array( "<listing target=1>\nNo newline</listing>", 'No newline' ),
     107            'LISTING + CR + LF'          => array( "<listing target=1>\r\nNo newline</listing>", 'No newline' ),
     108            'LISTING + Encoded LF'       => array( '<listing target=1>&#x0a;No newline</listing>', 'No newline' ),
     109            'LISTING + Encoded CR'       => array( '<listing target=1>&#x0d;Newline</listing>', "\rNewline" ),
     110            'LISTING + Encoded CR + LF'  => array( '<listing target=1>&#x0d;&#x0a;Newline</listing>', "\r\nNewline" ),
     111            'PRE + newline'              => array( "<pre target=1>\nNo newline</pre>", 'No newline' ),
     112            'PRE + CR + LF'              => array( "<pre target=1>\r\nNo newline</pre>", 'No newline' ),
     113            'PRE + Encoded LF'           => array( '<pre target=1>&#x0a;No newline</pre>', 'No newline' ),
     114            'PRE + Encoded CR'           => array( '<pre target=1>&#x0d;Newline</pre>', "\rNewline" ),
     115            'PRE + Encoded CR + LF'      => array( '<pre target=1>&#x0d;&#x0a;Newline</pre>', "\r\nNewline" ),
     116            'TEXTAREA + newline'         => array( "<textarea target>\nNo newline</textarea>", 'No newline' ),
     117            'TEXTAREA + CR + LF'         => array( "<textarea target>\r\nNo newline</textarea>", 'No newline' ),
     118            'TEXTAREA + Encoded LF'      => array( '<textarea target>&#x0a;No newline</textarea>', 'No newline' ),
     119            'TEXTAREA + Encoded CR'      => array( '<textarea target>&#x0d;Newline</textarea>', "\rNewline" ),
     120            'TEXTAREA + Encoded CR + LF' => array( '<textarea target>&#x0d;&#x0a;Newline</textarea>', "\r\nNewline" ),
     121            'TEXTAREA + Comment-like'    => array( "<textarea target><!-- comment -->\nNo newline</textarea>", "<!-- comment -->\nNo newline" ),
     122            'PRE + Comment'              => array( "<pre target=2><!-- comment -->\nNo newline</pre>", "\nNo newline" ),
     123            'PRE + CDATA-like'           => array( "<pre target=2><![CDATA[test]]>\nNo newline</pre>", "\nNo newline" ),
     124            'LISTING + NULL byte'        => array( "<listing target=1>\x00 is missing</listing>", ' is missing' ),
     125            'PRE + NULL byte'            => array( "<pre target=1>\x00 is missing</pre>", ' is missing' ),
     126            'TEXTAREA + NULL byte'       => array( "<textarea target>\x00 is U+FFFD</textarea>", "\u{FFFD} is U+FFFD" ),
     127            'SCRIPT + NULL byte'         => array( "<script target>\x00 is U+FFFD</script>", "\u{FFFD} is U+FFFD" ),
     128            'esc(SCRIPT) + NULL byte'    => array( "<script target><!-- <script> \x00 </script> --> is U+FFFD</script>", "<!-- <script> \u{FFFD} </script> --> is U+FFFD" ),
     129            'STYLE + NULL byte'          => array( "<style target>\x00 is U+FFFD</style>", "\u{FFFD} is U+FFFD" ),
     130            'XMP + NULL byte'            => array( "<xmp target>\x00 is U+FFFD</xmp>", "\u{FFFD} is U+FFFD" ),
     131            'CDATA-like + NULL byte'     => array( "<span target=1><![CDATA[just a \x00comment]]>", "just a \u{FFFD}comment" ),
     132            'Funky comment + NULL byte'  => array( "<span target=1></%just a \x00comment>", "%just a \u{FFFD}comment" ),
     133        );
     134    }
     135
     136    /**
    60137     * Ensures that normative Elements are properly parsed.
    61138     *
Note: See TracChangeset for help on using the changeset viewer.