Changeset 59076
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/html-api/class-wp-html-processor.php
/**
 * Normalizes an HTML fragment by serializing it.
 *
 * This method assumes that the given HTML snippet is found in BODY context.
 * For normalizing full documents or fragments found in other contexts, create
 * a new processor using {@see WP_HTML_Processor::create_fragment} or
 * {@see WP_HTML_Processor::create_full_parser} and call {@see WP_HTML_Processor::serialize}
 * on the created instances.
 *
 * Many aspects of an input HTML fragment may be changed during normalization.
 *
 *  - Attribute values will be double-quoted.
 *  - Duplicate attributes will be removed.
 *  - Omitted tags will be added.
 *  - Tag and attribute name casing will be lower-cased,
 *    except for specific SVG and MathML tags or attributes.
 *  - Text will be re-encoded, null bytes handled,
 *    and invalid UTF-8 replaced with U+FFFD.
 *  - Any incomplete syntax trailing at the end will be omitted,
 *    for example, an unclosed comment opener will be removed.
 *
 * Example:
 *
 *     echo WP_HTML_Processor::normalize( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
 *     // <a href="#anchor" v="5" enabled>One</a>
 *
 *     echo WP_HTML_Processor::normalize( '<div></p>fun<table><td>cell</div>' );
 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
 *
 *     echo WP_HTML_Processor::normalize( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
 *
 * @since 6.7.0
 *
 * @param string $html Input HTML to normalize.
 *
 * @return string|null Normalized output, or `null` if unable to normalize.
 */
public static function normalize( string $html ): ?string {
	$processor = static::create_fragment( $html );

	/*
	 * NOTE(review): guards against `create_fragment()` yielding `null`
	 * (believed possible for unsupported contexts/encodings or in subclasses);
	 * confirm against its declaration. Without the guard a `null` here would be
	 * a fatal error instead of the documented `null` return.
	 */
	if ( null === $processor ) {
		return null;
	}

	return $processor->serialize();
}

/**
 * Returns normalized HTML for a fragment by serializing it.
 *
 * This differs from {@see WP_HTML_Processor::normalize} in that it starts with
 * a specific HTML Processor, which _must_ not have already started scanning;
 * it must be in the initial ready state and will be in the completed state once
 * serialization is complete.
 *
 * Many aspects of an input HTML fragment may be changed during normalization.
 *
 *  - Attribute values will be double-quoted.
 *  - Duplicate attributes will be removed.
 *  - Omitted tags will be added.
 *  - Tag and attribute name casing will be lower-cased,
 *    except for specific SVG and MathML tags or attributes.
 *  - Text will be re-encoded, null bytes handled,
 *    and invalid UTF-8 replaced with U+FFFD.
 *  - Any incomplete syntax trailing at the end will be omitted,
 *    for example, an unclosed comment opener will be removed.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
 *     echo $processor->serialize();
 *     // <a href="#anchor" v="5" enabled>One</a>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' );
 *     echo $processor->serialize();
 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
 *     echo $processor->serialize();
 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
 *
 * @since 6.7.0
 *
 * @return string|null Normalized HTML markup represented by processor,
 *                     or `null` if unable to generate serialization.
 */
public function serialize(): ?string {
	// Serialization walks every token, so it must begin before any scanning has happened.
	if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) {
		wp_trigger_error(
			__METHOD__,
			'An HTML Processor which has already started processing cannot serialize its contents. Serialize immediately after creating the instance.',
			E_USER_WARNING
		);
		return null;
	}

	$html = '';
	while ( $this->next_token() ) {
		$html .= $this->serialize_token();
	}

	// The token loop also stops when parsing fails; partial output is discarded in that case.
	$last_error = $this->get_last_error();
	if ( null !== $last_error ) {
		wp_trigger_error(
			__METHOD__,
			"Cannot serialize HTML Processor with parsing error: {$last_error}.",
			E_USER_WARNING
		);
		return null;
	}

	return $html;
}

/**
 * Serializes the currently-matched token.
 *
 * This method produces a fully-normative HTML string for the currently-matched token,
 * if able. If not matched at any token or if the token doesn't correspond to any HTML
 * it will return an empty string (for example, presumptuous end tags are ignored).
 *
 * For the elements IFRAME, NOEMBED, NOFRAMES, SCRIPT, STYLE, TEXTAREA, TITLE, and XMP
 * in the HTML namespace, the returned string also contains the element's text content
 * (emptied for IFRAME, NOEMBED, and NOFRAMES) and its closing tag.
 *
 * @see static::serialize()
 *
 * @since 6.7.0
 *
 * @return string Serialization of token, or empty string if no serialization exists.
 */
protected function serialize_token(): string {
	$html       = '';
	$token_type = $this->get_token_type();

	switch ( $token_type ) {
		case '#text':
			$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
			break;

		// Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
		case '#presumptuous-tag':
			break;

		case '#funky-comment':
		case '#comment':
			$html .= "<!--{$this->get_full_comment_text()}-->";
			break;

		case '#cdata-section':
			$html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
			break;

		/*
		 * NOTE(review): DOCTYPE declarations are believed to be reported by
		 * `get_token_type()` as `#doctype`, with `html` being the token _name_
		 * rather than its type; confirm against `get_token_type()`. Both values
		 * are accepted so that the DOCTYPE is not silently dropped.
		 */
		case '#doctype':
		case 'html':
			$html .= '<!DOCTYPE html>';
			break;
	}

	if ( '#tag' !== $token_type ) {
		return $html;
	}

	$tag_name       = str_replace( "\x00", "\u{FFFD}", $this->get_tag() );
	$in_html        = 'html' === $this->get_namespace();
	$qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();

	if ( $this->is_tag_closer() ) {
		$html .= "</{$qualified_name}>";
		return $html;
	}

	$attribute_names = $this->get_attribute_names_with_prefix( '' );
	if ( ! isset( $attribute_names ) ) {
		$html .= "<{$qualified_name}>";
		return $html;
	}

	$html .= "<{$qualified_name}";
	foreach ( $attribute_names as $attribute_name ) {
		$html .= " {$this->get_qualified_attribute_name( $attribute_name )}";
		$value = $this->get_attribute( $attribute_name );

		// Only string values are written out; any other value leaves the bare attribute name.
		if ( is_string( $value ) ) {
			$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ) . '"';
		}
	}

	/*
	 * Replace null bytes once for the entire tag opener. At this point `$html`
	 * holds nothing but the tag opener, so a single pass after the loop covers
	 * the tag name and every attribute without rescanning on each iteration.
	 */
	$html = str_replace( "\x00", "\u{FFFD}", $html );

	// The self-closing flag is only written for elements outside the HTML namespace.
	if ( ! $in_html && $this->has_self_closing_flag() ) {
		$html .= ' /';
	}

	$html .= '>';

	// Flush out self-contained elements.
	if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) {
		$text = $this->get_modifiable_text();

		switch ( $tag_name ) {
			// The contents of these elements are dropped.
			case 'IFRAME':
			case 'NOEMBED':
			case 'NOFRAMES':
				$text = '';
				break;

			// The contents of these elements are written without escaping.
			case 'SCRIPT':
			case 'STYLE':
				break;

			// TEXTAREA, TITLE, and XMP contents are escaped like text.
			default:
				$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
		}

		$html .= "{$text}</{$qualified_name}>";
	}

	return $html;
}

/**
 * Parses next element in the 'initial' insertion mode.
 *
trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php
r59075 r59076 1984 1984 * [#x10000-#xEFFFF] 1985 1985 * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] 1986 * 1987 * @todo Processing instruction nodes in SGML may contain any kind of markup. XML defines a 1988 * special case with `<?xml ... ?>` syntax, but the `?` is part of the bogus comment. 1986 1989 * 1987 1990 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
Note: See TracChangeset
for help on using the changeset viewer.