Make WordPress Core

Changeset 59076


Ignore:
Timestamp:
09/20/2024 10:30:04 PM (5 months ago)
Author:
dmsnell
Message:

HTML API: Add normalize() to give us the HTML we always wanted.

HTML often appears in ways that are unexpected. It may be missing implicit tags, may have unquoted, single-quoted, or double-quoted attributes, may contain duplicate attributes, may contain unescaped text content, or any number of other possible invalid constructions. The HTML API understands all of these inputs, but downline parsers may not, and HTML snippets which are safe on their own may introduce problems when joined with other HTML snippets.

This patch introduces the serialize() method on the HTML Processor, which prints a fully-normative HTML output, eliminating invalid markup along the way. It produces a string which contains every missing tag, double-quoted attributes, and no duplicates. A normalize() static method on the HTML Processor provides a convenient wrapper for constructing a fragment parser and immediately serializing.

Subclasses relying on the serialize_token() method may perform structural HTML modifications with as much security as the upcoming \Dom\HTMLDocument() parser will, though these are not able to provide the full safety that will eventually appear with set_inner_html().

Further work may explore serializing to XML (which involves a number of other important transformations) and adding constraints to serialization (such as only allowing inline/flow/formatting elements and text).

Developed in https://github.com/wordpress/wordpress-develop/pull/7331
Discussed in https://core.trac.wordpress.org/ticket/62036

Props dmsnell, jonsurrell, westonruter.
Fixes #62036.

Location:
trunk
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r59053 r59076  
    10311031
    10321032    /**
     1033     * Normalizes an HTML fragment by serializing it.
     1034     *
     1035     * This method assumes that the given HTML snippet is found in BODY context.
     1036     * For normalizing full documents or fragments found in other contexts, create
     1037     * a new processor using {@see WP_HTML_Processor::create_fragment} or
     1038     * {@see WP_HTML_Processor::create_full_parser} and call {@see WP_HTML_Processor::serialize}
     1039     * on the created instances.
     1040     *
     1041     * Many aspects of an input HTML fragment may be changed during normalization.
     1042     *
     1043     *  - Attribute values will be double-quoted.
     1044     *  - Duplicate attributes will be removed.
     1045     *  - Omitted tags will be added.
     1046     *  - Tag and attribute name casing will be lower-cased,
     1047     *    except for specific SVG and MathML tags or attributes.
     1048     *  - Text will be re-encoded, null bytes handled,
     1049     *    and invalid UTF-8 replaced with U+FFFD.
     1050     *  - Any incomplete syntax trailing at the end will be omitted,
     1051     *    for example, an unclosed comment opener will be removed.
     1052     *
     1053     * Example:
     1054     *
     1055     *     echo WP_HTML_Processor::normalize( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
     1056     *     // <a href="#anchor" v="5" enabled>One</a>
     1057     *
     1058     *     echo WP_HTML_Processor::normalize( '<div></p>fun<table><td>cell</div>' );
     1059     *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
     1060     *
     1061     *     echo WP_HTML_Processor::normalize( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
     1062     *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
     1063     *
     1064     * @since 6.7.0
     1065     *
     1066     * @param string $html Input HTML to normalize.
     1067     *
     1068     * @return string|null Normalized output, or `null` if unable to normalize.
     1069     */
     1070    public static function normalize( string $html ): ?string {
     1071        return static::create_fragment( $html )->serialize();
     1072    }
     1073
     1074    /**
     1075     * Returns normalized HTML for a fragment by serializing it.
     1076     *
     1077     * This differs from {@see WP_HTML_Processor::normalize} in that it starts with
     1078     * a specific HTML Processor, which _must_ not have already started scanning;
     1079     * it must be in the initial ready state and will be in the completed state once
     1080     * serialization is complete.
     1081     *
     1082     * Many aspects of an input HTML fragment may be changed during normalization.
     1083     *
     1084     *  - Attribute values will be double-quoted.
     1085     *  - Duplicate attributes will be removed.
     1086     *  - Omitted tags will be added.
     1087     *  - Tag and attribute name casing will be lower-cased,
     1088     *    except for specific SVG and MathML tags or attributes.
     1089     *  - Text will be re-encoded, null bytes handled,
     1090     *    and invalid UTF-8 replaced with U+FFFD.
     1091     *  - Any incomplete syntax trailing at the end will be omitted,
     1092     *    for example, an unclosed comment opener will be removed.
     1093     *
     1094     * Example:
     1095     *
     1096     *     $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
     1097     *     echo $processor->serialize();
     1098     *     // <a href="#anchor" v="5" enabled>One</a>
     1099     *
     1100     *     $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' );
     1101     *     echo $processor->serialize();
     1102     *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
     1103     *
     1104     *     $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
     1105     *     echo $processor->serialize();
     1106     *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
     1107     *
     1108     * @since 6.7.0
     1109     *
     1110     * @return string|null Normalized HTML markup represented by processor,
     1111     *                     or `null` if unable to generate serialization.
     1112     */
     1113    public function serialize(): ?string {
     1114        if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) {
     1115            wp_trigger_error(
     1116                __METHOD__,
     1117                "An HTML Processor which has already started processing cannot serialize it's contents. Serialize immediately after creating the instance.",
     1118                E_USER_WARNING
     1119            );
     1120            return null;
     1121        }
     1122
     1123        $html = '';
     1124        while ( $this->next_token() ) {
     1125            $html .= $this->serialize_token();
     1126        }
     1127
     1128        if ( null !== $this->get_last_error() ) {
     1129            wp_trigger_error(
     1130                __METHOD__,
     1131                "Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.",
     1132                E_USER_WARNING
     1133            );
     1134            return null;
     1135        }
     1136
     1137        return $html;
     1138    }
     1139
     1140    /**
     1141     * Serializes the currently-matched token.
     1142     *
     1143     * This method produces a fully-normative HTML string for the currently-matched token,
     1144     * if able. If not matched at any token or if the token doesn't correspond to any HTML
     1145     * it will return an empty string (for example, presumptuous end tags are ignored).
     1146     *
     1147     * @see static::serialize()
     1148     *
     1149     * @since 6.7.0
     1150     *
     1151     * @return string Serialization of token, or empty string if no serialization exists.
     1152     */
     1153    protected function serialize_token(): string {
     1154        $html       = '';
     1155        $token_type = $this->get_token_type();
     1156
     1157        switch ( $token_type ) {
     1158            case '#text':
     1159                $html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
     1160                break;
     1161
     1162            // Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
     1163            case '#presumptuous-tag':
     1164                break;
     1165
     1166            case '#funky-comment':
     1167            case '#comment':
     1168                $html .= "<!--{$this->get_full_comment_text()}-->";
     1169                break;
     1170
     1171            case '#cdata-section':
     1172                $html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
     1173                break;
     1174
     1175            case 'html':
     1176                $html .= '<!DOCTYPE html>';
     1177                break;
     1178        }
     1179
     1180        if ( '#tag' !== $token_type ) {
     1181            return $html;
     1182        }
     1183
     1184        $tag_name       = str_replace( "\x00", "\u{FFFD}", $this->get_tag() );
     1185        $in_html        = 'html' === $this->get_namespace();
     1186        $qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();
     1187
     1188        if ( $this->is_tag_closer() ) {
     1189            $html .= "</{$qualified_name}>";
     1190            return $html;
     1191        }
     1192
     1193        $attribute_names = $this->get_attribute_names_with_prefix( '' );
     1194        if ( ! isset( $attribute_names ) ) {
     1195            $html .= "<{$qualified_name}>";
     1196            return $html;
     1197        }
     1198
     1199        $html .= "<{$qualified_name}";
     1200        foreach ( $attribute_names as $attribute_name ) {
     1201            $html .= " {$this->get_qualified_attribute_name( $attribute_name )}";
     1202            $value = $this->get_attribute( $attribute_name );
     1203
     1204            if ( is_string( $value ) ) {
     1205                $html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"';
     1206            }
     1207
     1208            $html = str_replace( "\x00", "\u{FFFD}", $html );
     1209        }
     1210
     1211        if ( ! $in_html && $this->has_self_closing_flag() ) {
     1212            $html .= ' /';
     1213        }
     1214
     1215        $html .= '>';
     1216
     1217        // Flush out self-contained elements.
     1218        if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) {
     1219            $text = $this->get_modifiable_text();
     1220
     1221            switch ( $tag_name ) {
     1222                case 'IFRAME':
     1223                case 'NOEMBED':
     1224                case 'NOFRAMES':
     1225                    $text = '';
     1226                    break;
     1227
     1228                case 'SCRIPT':
     1229                case 'STYLE':
     1230                    break;
     1231
     1232                default:
     1233                    $text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
     1234            }
     1235
     1236            $html .= "{$text}</{$qualified_name}>";
     1237        }
     1238
     1239        return $html;
     1240    }
     1241
     1242    /**
    10331243     * Parses next element in the 'initial' insertion mode.
    10341244     *
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r59075 r59076  
    19841984                 *                     [#x10000-#xEFFFF]
    19851985                 * > NameChar      ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
     1986                 *
     1987                 * @todo Processing instruction nodes in SGML may contain any kind of markup. XML defines a
     1988                 *       special case with `<?xml ... ?>` syntax, but the `?` is part of the bogus comment.
    19861989                 *
    19871990                 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
Note: See TracChangeset for help on using the changeset viewer.