Changeset 60768
- Timestamp:
- 09/16/2025 12:35:01 PM (7 months ago)
- Location:
- trunk
- Files:
-
- 2 edited
-
phpcs.xml.dist (modified) (1 diff)
-
src/wp-includes/compat-utf8.php (modified) (7 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/phpcs.xml.dist
r58925 r60768 256 256 </rule> 257 257 258 <!-- Exclude forbidding goto in the HTML Processor, which mimics algorithms that are written259 this way in the HTML specification, and these particular algorithms are complex and260 highly imperative. Avoiding the goto introduces a number of risks that could make it261 more difficult to maintain the relationship to the standard, lead to subtle differences262 in the parsing, and distance the code from its standard. -->263 258 <rule ref="Generic.PHP.DiscourageGoto.Found"> 259 <!-- Exclude forbidding goto in the HTML Processor, which mimics algorithms that are written 260 this way in the HTML specification, and these particular algorithms are complex and 261 highly imperative. Avoiding the goto introduces a number of risks that could make it 262 more difficult to maintain the relationship to the standard, lead to subtle differences 263 in the parsing, and distance the code from its standard. --> 264 264 <exclude-pattern>/wp-includes/html-api/class-wp-html-processor\.php</exclude-pattern> 265 265 <exclude-pattern>/wp-includes/html-api/class-wp-html-doctype-info\.php</exclude-pattern> 266 267 <!-- Goto is an effective way to handle errors in decoders which expect valid bytes 268 without impacting the fast path while avoiding bloating the code with redundant 269 and risky handling code. Exclude forbidding goto in UTF-8 fallback code. --> 270 <exclude-pattern>/wp-includes/compat-utf8\.php</exclude-pattern> 266 271 </rule> 267 272 -
trunk/src/wp-includes/compat-utf8.php
r60764 r60768 2 2 3 3 /** 4 * Fallback mechanism for safely validating UTF-8 bytes. 5 * 6 * By implementing a raw method here the code will behave in the same way on 7 * all installed systems, regardless of what extensions are installed. 8 * 9 * @see wp_is_valid_utf8 4 * Finds spans of valid and invalid UTF-8 bytes in a given string. 5 * 6 * This is a low-level tool to power various UTF-8 functionality. 7 * It scans through a string until it finds invalid byte spans. 8 * When it does this, it does three things: 9 * 10 * - Assigns `$at` to the position after the last successful code point. 11 * - Assigns `$invalid_length` to the length of the maximal subpart of 12 * the invalid bytes starting at `$at`. 13 * - Returns how many code points were successfully scanned. 14 * 15 * This information is enough to build a number of useful UTF-8 functions. 16 * 17 * Example: 18 * 19 * // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1. 20 * "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' ); 21 * $at = $invalid_length = 0; 22 * 23 * // The first step finds the invalid 0xF1 byte. 24 * 2 === _wp_scan_utf8( $pineapple, $at, $invalid_length ); 25 * $at === 2; $invalid_length === 1; 26 * 27 * // After skipping the invalid span (`$at += $invalid_length`), the second step continues to the end of the string. 28 * 1 === _wp_scan_utf8( $pineapple, $at, $invalid_length ); 29 * $at === 4; $invalid_length === 0; 30 * 31 * Note! This function’s many arguments are passed without an “options” 32 * array. This choice is based on the fact that this is a low-level function 33 * and there’s no need to create an array of items on every invocation. 10 34 * 11 35 * @since 6.9.0 12 36 * @access private 13 37 * 14 * @param string $bytes String which might contain text encoded as UTF-8. 15 * @return bool Whether the provided bytes can decode as valid UTF-8. 38 * @param string $bytes UTF-8 encoded string which might include invalid spans of bytes. 39 * @param int $at Where to start scanning. 
40 * @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`. 41 * @param int|null $max_bytes Stop scanning after this many bytes have been seen. 42 * @param int|null $max_code_points Stop scanning after this many code points have been seen. 43 * @return int How many code points were successfully scanned. 16 44 */ 17 function _wp_is_valid_utf8_fallback( string $bytes ): bool { 18 $end = strlen( $bytes ); 19 20 for ( $i = 0; $i < $end; $i++ ) { 45 function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null ): int { 46 $byte_length = strlen( $bytes ); 47 $end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) ); 48 $invalid_length = 0; 49 $count = 0; 50 $max_count = $max_code_points ?? PHP_INT_MAX; 51 52 for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) { 21 53 /* 22 54 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8. … … 25 57 * depending on whether the JIT has optimized the function. 26 58 */ 27 $ i += strspn(59 $ascii_byte_count = strspn( 28 60 $bytes, 29 61 "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . 30 62 "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . 31 63 " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f", 32 $i 64 $i, 65 $end - $i 33 66 ); 67 68 if ( $count + $ascii_byte_count >= $max_count ) { 69 $at = $i + ( $max_count - $count ); 70 $count = $max_count; 71 return $count; 72 } 73 74 $count += $ascii_byte_count; 75 $i += $ascii_byte_count; 76 34 77 if ( $i >= $end ) { 35 break; 78 $at = $end; 79 return $count; 36 80 } 37 81 … … 41 85 * 42 86 * Therefore everything past here is checking those multibyte sequences. 87 * 88 * It may look like there’s a need to check against the max bytes here, 89 * but since each match of a single character returns, this function will 90 * bail already if crossing the max-bytes threshold. 
This function SHALL 91 * NOT return in the middle of a multi-byte character, so if a character 92 * falls on each side of the max bytes, the entire character will be scanned. 93 * 43 94 * Because it’s possible that there are truncated characters, the use of 44 95 * the null-coalescing operator with "\xC0" is a convenience for skipping … … 47 98 * truncated, it will find 0xC0 and reject as invalid UTF-8. 48 99 * 49 * > [The following table] lists all of the byte sequences that are well-formed100 * > [The following table] lists all of the byte sequences that are well-formed 50 101 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte 51 102 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value … … 67 118 * ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯ 68 119 * 69 * Notice that all valid third and forth bytes are in the range 80..BF. This70 * validator takes advantage of that to only check the range of those bytes once.71 *72 * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/73 120 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506 74 121 */ 75 122 123 // Valid two-byte code points. 76 124 $b1 = ord( $bytes[ $i ] ); 77 125 $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" ); 78 126 79 // Valid two-byte code points.80 81 127 if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) { 128 ++$count; 82 129 ++$i; 83 130 continue; 84 131 } 85 132 133 // Valid three-byte code points. 86 134 $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" ); 87 135 88 // Valid three-byte code points.89 90 136 if ( $b3 < 0x80 || $b3 > 0xBF ) { 91 return false;137 goto invalid_utf8; 92 138 } 93 139 … … 98 144 ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF ) 99 145 ) { 146 ++$count; 100 147 $i += 2; 101 148 continue; 102 149 } 103 150 151 // Valid four-byte code points. 104 152 $b4 = ord( $bytes[ $i + 3 ] ?? 
"\xC0" ); 105 153 106 // Valid four-byte code points.107 108 154 if ( $b4 < 0x80 || $b4 > 0xBF ) { 109 return false;155 goto invalid_utf8; 110 156 } 111 157 … … 115 161 ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F ) 116 162 ) { 163 ++$count; 117 164 $i += 3; 118 165 continue; 119 166 } 120 167 121 // Any other sequence is invalid. 122 return false; 168 /** 169 * When encountering invalid byte sequences, Unicode suggests finding the 170 * maximal subpart of a text and replacing that subpart with a single 171 * replacement character. 172 * 173 * > This practice is more secure because it does not result in the 174 * > conversion consuming parts of valid sequences as though they were 175 * > invalid. It also guarantees at least one replacement character will 176 * > occur for each instance of an invalid sequence in the original text. 177 * > Furthermore, this practice can be defined consistently for better 178 * > interoperability between different implementations of conversion. 179 * 180 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630 181 */ 182 invalid_utf8: 183 $at = $i; 184 $invalid_length = 1; 185 186 // Single-byte and two-byte characters. 187 if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) { 188 return $count; 189 } 190 191 $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" ); 192 $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" ); 193 194 // Find the maximal subpart and skip past it. 195 if ( 0xE0 === ( $b1 & 0xF0 ) ) { 196 // Three-byte characters. 197 $b2_valid = ( 198 ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) || 199 ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) || 200 ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) || 201 ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF ) 202 ); 203 204 $invalid_length = min( $end - $i, $b2_valid ? 2 : 1 ); 205 return $count; 206 } elseif ( 0xF0 === ( $b1 & 0xF8 ) ) { 207 // Four-byte characters. 
208 $b2_valid = ( 209 ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) || 210 ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) || 211 ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F ) 212 ); 213 214 $b3_valid = $b3 >= 0x80 && $b3 <= 0xBF; 215 216 $invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 ); 217 return $count; 218 } 219 220 return $count; 123 221 } 124 222 125 // Reaching the end implies validating every byte.126 return true;223 $at = $i; 224 return $count; 127 225 } 226 227 /** 228 * Fallback mechanism for safely validating UTF-8 bytes. 229 * 230 * @see wp_is_valid_utf8() 231 * 232 * @since 6.9.0 233 * @access private 234 * 235 * @param string $bytes String which might contain text encoded as UTF-8. 236 * @return bool Whether the provided bytes can decode as valid UTF-8. 237 */ 238 function _wp_is_valid_utf8_fallback( string $bytes ): bool { 239 $bytes_length = strlen( $bytes ); 240 if ( 0 === $bytes_length ) { 241 return true; 242 } 243 244 $next_byte_at = 0; 245 $invalid_length = 0; 246 247 _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length ); 248 249 return $bytes_length === $next_byte_at && 0 === $invalid_length; 250 }
Note: See TracChangeset
for help on using the changeset viewer.