| | 146 | |
/**
 * Tests that remove_accents() strips every listed Mn (nonspacing) combining mark
 * when the marks are appended to the Latin letter 'a'.
 *
 * @ticket 24661
 */
public function test_remove_accents_mn_combining_marks() {
	// All Mn nonspacing combining marks (generated from Unicode 5.0.0 "UnicodeData.txt", 880 code points).
	// (Using Unicode 5.0.0 to be compatible with PHP 5.2.4 (PCRE 7.2).)
	$code_points = array_merge(
		range( 0x300, 0x36f ),
		range( 0x483, 0x486 ),
		range( 0x591, 0x5bd ),
		array( 0x5bf ),
		range( 0x5c1, 0x5c2 ),
		range( 0x5c4, 0x5c5 ),
		array( 0x5c7 ),
		range( 0x610, 0x615 ),
		range( 0x64b, 0x65e ),
		array( 0x670 ),
		range( 0x6d6, 0x6dc ),
		range( 0x6df, 0x6e4 ),
		range( 0x6e7, 0x6e8 ),
		range( 0x6ea, 0x6ed ),
		array( 0x711 ),
		range( 0x730, 0x74a ),
		range( 0x7a6, 0x7b0 ),
		range( 0x7eb, 0x7f3 ),
		range( 0x901, 0x902 ),
		array( 0x93c ),
		range( 0x941, 0x948 ),
		array( 0x94d ),
		range( 0x951, 0x954 ),
		range( 0x962, 0x963 ),
		array( 0x981 ),
		array( 0x9bc ),
		range( 0x9c1, 0x9c4 ),
		array( 0x9cd ),
		range( 0x9e2, 0x9e3 ),
		range( 0xa01, 0xa02 ),
		array( 0xa3c ),
		range( 0xa41, 0xa42 ),
		range( 0xa47, 0xa48 ),
		range( 0xa4b, 0xa4d ),
		range( 0xa70, 0xa71 ),
		range( 0xa81, 0xa82 ),
		array( 0xabc ),
		range( 0xac1, 0xac5 ),
		range( 0xac7, 0xac8 ),
		array( 0xacd ),
		range( 0xae2, 0xae3 ),
		array( 0xb01 ),
		array( 0xb3c ),
		array( 0xb3f ),
		range( 0xb41, 0xb43 ),
		array( 0xb4d ),
		array( 0xb56 ),
		array( 0xb82 ),
		array( 0xbc0 ),
		array( 0xbcd ),
		range( 0xc3e, 0xc40 ),
		range( 0xc46, 0xc48 ),
		range( 0xc4a, 0xc4d ),
		range( 0xc55, 0xc56 ),
		array( 0xcbc ),
		array( 0xcbf ),
		array( 0xcc6 ),
		range( 0xccc, 0xccd ),
		range( 0xce2, 0xce3 ),
		range( 0xd41, 0xd43 ),
		array( 0xd4d ),
		array( 0xdca ),
		range( 0xdd2, 0xdd4 ),
		array( 0xdd6 ),
		array( 0xe31 ),
		range( 0xe34, 0xe3a ),
		range( 0xe47, 0xe4e ),
		array( 0xeb1 ),
		range( 0xeb4, 0xeb9 ),
		range( 0xebb, 0xebc ),
		range( 0xec8, 0xecd ),
		range( 0xf18, 0xf19 ),
		array( 0xf35 ),
		array( 0xf37 ),
		array( 0xf39 ),
		range( 0xf71, 0xf7e ),
		range( 0xf80, 0xf84 ),
		range( 0xf86, 0xf87 ),
		range( 0xf90, 0xf97 ),
		range( 0xf99, 0xfbc ),
		array( 0xfc6 ),
		range( 0x102d, 0x1030 ),
		array( 0x1032 ),
		range( 0x1036, 0x1037 ),
		array( 0x1039 ),
		range( 0x1058, 0x1059 ),
		array( 0x135f ),
		range( 0x1712, 0x1714 ),
		range( 0x1732, 0x1734 ),
		range( 0x1752, 0x1753 ),
		range( 0x1772, 0x1773 ),
		range( 0x17b7, 0x17bd ),
		array( 0x17c6 ),
		range( 0x17c9, 0x17d3 ),
		array( 0x17dd ),
		range( 0x180b, 0x180d ),
		array( 0x18a9 ),
		range( 0x1920, 0x1922 ),
		range( 0x1927, 0x1928 ),
		array( 0x1932 ),
		range( 0x1939, 0x193b ),
		range( 0x1a17, 0x1a18 ),
		range( 0x1b00, 0x1b03 ),
		array( 0x1b34 ),
		range( 0x1b36, 0x1b3a ),
		array( 0x1b3c ),
		array( 0x1b42 ),
		range( 0x1b6b, 0x1b73 ),
		range( 0x1dc0, 0x1dca ),
		range( 0x1dfe, 0x1dff ),
		range( 0x20d0, 0x20dc ),
		array( 0x20e1 ),
		range( 0x20e5, 0x20ef ),
		range( 0x302a, 0x302d ), // NOTE: U+302E & U+302F recategorized as Mc in Unicode 6.1.0
		range( 0x3099, 0x309a ),
		array( 0xa806 ),
		array( 0xa80b ),
		range( 0xa825, 0xa826 ),
		array( 0xfb1e ),
		range( 0xfe00, 0xfe0f ),
		range( 0xfe20, 0xfe23 ),
		range( 0x10a01, 0x10a03 ),
		range( 0x10a05, 0x10a06 ),
		range( 0x10a0c, 0x10a0f ),
		range( 0x10a38, 0x10a3a ),
		array( 0x10a3f ),
		range( 0x1d167, 0x1d169 ),
		range( 0x1d17b, 0x1d182 ),
		range( 0x1d185, 0x1d18b ),
		range( 0x1d1aa, 0x1d1ad ),
		range( 0x1d242, 0x1d244 ),
		range( 0xe0100, 0xe01ef )
	);

	// Start with a Latin letter: the marks are only removed when they follow a Latin character.
	$combining_marks = 'a';

	// Append each code point as UTF-8. The smallest code point above is U+0300,
	// so no single-byte (ASCII) branch is needed.
	foreach ( $code_points as $c ) {
		if ( $c < 0x800 ) {
			// Two-byte sequence.
			$combining_marks .= chr( 0xc0 | ( $c >> 6 ) )
				. chr( 0x80 | ( $c & 0x3f ) );
		} elseif ( $c < 0x10000 ) {
			// Three-byte sequence.
			$combining_marks .= chr( 0xe0 | ( $c >> 12 ) )
				. chr( 0x80 | ( ( $c >> 6 ) & 0x3f ) )
				. chr( 0x80 | ( $c & 0x3f ) );
		} else {
			// Four-byte sequence.
			$combining_marks .= chr( 0xf0 | ( $c >> 18 ) )
				. chr( 0x80 | ( ( $c >> 12 ) & 0x3f ) )
				. chr( 0x80 | ( ( $c >> 6 ) & 0x3f ) )
				. chr( 0x80 | ( $c & 0x3f ) );
		}
	}

	// Performs the test: all the characters should be removed.
	$this->assertSame( 'a', remove_accents( $combining_marks ) );

	if ( _wp_can_use_pcre_ucp() ) { // If UCP available.
		// Test single-byte replace also.
		// NOTE(review): passing false presumably forces the non-UCP code path and
		// 'reset' restores detection — confirm against _wp_can_use_pcre_ucp().
		_wp_can_use_pcre_ucp( false );
		$single_byte_result = remove_accents( $combining_marks );

		// Restore the UCP setting BEFORE asserting, so a failed assertion
		// cannot leak the override into subsequent tests.
		_wp_can_use_pcre_ucp( 'reset' );

		$this->assertSame( 'a', $single_byte_result );
	}
}
| | 306 | |
/**
 * Checks that remove_accents() turns a given input into the expected output.
 *
 * @ticket 24661
 * @dataProvider data_remove_accents_combining
 *
 * @param string $input  String passed to remove_accents().
 * @param string $output Expected return value of remove_accents().
 */
public function test_remove_accents_combining( $input, $output ) {
	$this->assertSame( $output, remove_accents( $input ) );
}
| | 314 | |
/**
 * Data provider for test_remove_accents_combining().
 *
 * Declared static so it stays usable under PHPUnit versions that require
 * static data providers.
 *
 * @return array[] List of datasets, each being array( input string, expected output string ).
 */
public static function data_remove_accents_combining() {
	return array(
		// Test a collection of filenames that could have problems.
		array(
			"Capture d’e\xcc\x81cran 2013-02-20 a\xcc\x80 23.36.06.png", // NOTE: apostrophe is U+2019
			// NOTE: ideally the apostrophe would be converted to U+0027 (ASCII apostrophe),
			// but the expected value below keeps U+2019 unchanged.
			"Capture d’ecran 2013-02-20 a 23.36.06.png",
		),
		array(
			"Buttermo\xcc\x88deli.jpg",
			"Buttermodeli.jpg",
		),
		array(
			"Mu\xcc\x88nsterl.Mai13a.jpg",
			"Munsterl.Mai13a.jpg",
		),

		array(
			'Cáo nâu lanh lẹ nhảy qua người lười biếng. Do bạch kim rất quý, sẽ để lắp vô xương',
			"Cao nau lanh le nhay qua nguoi luoi bieng. Do bach kim rat quy, se de lap vo xuong",
		),
		array(
			'Příliš žluťoučký kůň úpěl ďábelské kódy. Pójdźże, kiń tę chmurność w głąb flaszy! Päťtýždňové vĺčatá nervózne štekajú na môjho ďatľa v tŕní.',
			"Prilis zlutoucky kun upel dabelske kody. Pojdzze, kin te chmurnosc w glab flaszy! Pattyzdnove vlcata nervozne stekaju na mojho datla v trni.",
		),
		array(
			"Les naïfs ægithales hâtifs pondant à noël où il gèle sont sûrs d'être déçus en voyant leurs drôles d'œufs abîmés.",
			"Les naifs aegithales hatifs pondant a noel ou il gele sont surs d'etre decus en voyant leurs droles d'oeufs abimes.",
		),
		array(
			'a᷄ (a + combining macron-acute). +⃟ (plus with enclosing diamond). a︠e︡ (ae with a combining ligature)',
			"a (a + combining macron-acute). +⃟ (plus with enclosing diamond). ae (ae with a combining ligature)", // NOTE: Not doing "Me" enclosing combining marks.
		),

		// From UTF-8 Sampler http://www.columbia.edu/~fdc/utf8/
		array(
			// NOTE: U+200C ZERO WIDTH NON-JOINER not dealt with.
			"Im finſteren Jagdſchloß am offenen Felsquellwaſſer patzte der affig-flatterhafte kauzig-höf\xe2\x80\x8cliche Bäcker über ſeinem verſifften kniffligen C-Xylophon.",
			"Im finsteren Jagdschlos am offenen Felsquellwasser patzte der affig-flatterhafte kauzig-hof\xe2\x80\x8cliche Backer uber seinem versifften kniffligen C-Xylophon.",
		),
	);
}