| 146 | |
/**
 * Checks that remove_accents() strips nonspacing (Mn) combining marks that follow a Latin letter.
 *
 * The input is the letter "a" followed by the UTF-8 encoding of every code point listed below;
 * the expected result is the bare "a". When _wp_can_use_pcre_ucp() reports support, the same
 * check is repeated with that support switched off.
 *
 * @ticket 24661
 */
public function test_remove_accents_mn_combining_marks() {
	// Mn nonspacing combining marks, generated from Unicode 5.0.0 "UnicodeData.txt" (880 code points in that version).
	// (Using Unicode 5.0.0 to be compatible with PHP 5.2.4 (PCRE 7.2).)
	// U+302E and U+302F are left out of the list; see the note on the 0x302a range.
	$code_points = array_merge(
		range( 0x300, 0x36f ),
		range( 0x483, 0x486 ),
		range( 0x591, 0x5bd ),
		array( 0x5bf ),
		range( 0x5c1, 0x5c2 ),
		range( 0x5c4, 0x5c5 ),
		array( 0x5c7 ),
		range( 0x610, 0x615 ),
		range( 0x64b, 0x65e ),
		array( 0x670 ),
		range( 0x6d6, 0x6dc ),
		range( 0x6df, 0x6e4 ),
		range( 0x6e7, 0x6e8 ),
		range( 0x6ea, 0x6ed ),
		array( 0x711 ),
		range( 0x730, 0x74a ),
		range( 0x7a6, 0x7b0 ),
		range( 0x7eb, 0x7f3 ),
		range( 0x901, 0x902 ),
		array( 0x93c ),
		range( 0x941, 0x948 ),
		array( 0x94d ),
		range( 0x951, 0x954 ),
		range( 0x962, 0x963 ),
		array( 0x981 ),
		array( 0x9bc ),
		range( 0x9c1, 0x9c4 ),
		array( 0x9cd ),
		range( 0x9e2, 0x9e3 ),
		range( 0xa01, 0xa02 ),
		array( 0xa3c ),
		range( 0xa41, 0xa42 ),
		range( 0xa47, 0xa48 ),
		range( 0xa4b, 0xa4d ),
		range( 0xa70, 0xa71 ),
		range( 0xa81, 0xa82 ),
		array( 0xabc ),
		range( 0xac1, 0xac5 ),
		range( 0xac7, 0xac8 ),
		array( 0xacd ),
		range( 0xae2, 0xae3 ),
		array( 0xb01 ),
		array( 0xb3c ),
		array( 0xb3f ),
		range( 0xb41, 0xb43 ),
		array( 0xb4d ),
		array( 0xb56 ),
		array( 0xb82 ),
		array( 0xbc0 ),
		array( 0xbcd ),
		range( 0xc3e, 0xc40 ),
		range( 0xc46, 0xc48 ),
		range( 0xc4a, 0xc4d ),
		range( 0xc55, 0xc56 ),
		array( 0xcbc ),
		array( 0xcbf ),
		array( 0xcc6 ),
		range( 0xccc, 0xccd ),
		range( 0xce2, 0xce3 ),
		range( 0xd41, 0xd43 ),
		array( 0xd4d ),
		array( 0xdca ),
		range( 0xdd2, 0xdd4 ),
		array( 0xdd6 ),
		array( 0xe31 ),
		range( 0xe34, 0xe3a ),
		range( 0xe47, 0xe4e ),
		array( 0xeb1 ),
		range( 0xeb4, 0xeb9 ),
		range( 0xebb, 0xebc ),
		range( 0xec8, 0xecd ),
		range( 0xf18, 0xf19 ),
		array( 0xf35 ),
		array( 0xf37 ),
		array( 0xf39 ),
		range( 0xf71, 0xf7e ),
		range( 0xf80, 0xf84 ),
		range( 0xf86, 0xf87 ),
		range( 0xf90, 0xf97 ),
		range( 0xf99, 0xfbc ),
		array( 0xfc6 ),
		range( 0x102d, 0x1030 ),
		array( 0x1032 ),
		range( 0x1036, 0x1037 ),
		array( 0x1039 ),
		range( 0x1058, 0x1059 ),
		array( 0x135f ),
		range( 0x1712, 0x1714 ),
		range( 0x1732, 0x1734 ),
		range( 0x1752, 0x1753 ),
		range( 0x1772, 0x1773 ),
		range( 0x17b7, 0x17bd ),
		array( 0x17c6 ),
		range( 0x17c9, 0x17d3 ),
		array( 0x17dd ),
		range( 0x180b, 0x180d ),
		array( 0x18a9 ),
		range( 0x1920, 0x1922 ),
		range( 0x1927, 0x1928 ),
		array( 0x1932 ),
		range( 0x1939, 0x193b ),
		range( 0x1a17, 0x1a18 ),
		range( 0x1b00, 0x1b03 ),
		array( 0x1b34 ),
		range( 0x1b36, 0x1b3a ),
		array( 0x1b3c ),
		array( 0x1b42 ),
		range( 0x1b6b, 0x1b73 ),
		range( 0x1dc0, 0x1dca ),
		range( 0x1dfe, 0x1dff ),
		range( 0x20d0, 0x20dc ),
		array( 0x20e1 ),
		range( 0x20e5, 0x20ef ),
		range( 0x302a, 0x302d ), // NOTE: U+302E & U+302F recategorized as Mc in Unicode 6.1.0
		range( 0x3099, 0x309a ),
		array( 0xa806 ),
		array( 0xa80b ),
		range( 0xa825, 0xa826 ),
		array( 0xfb1e ),
		range( 0xfe00, 0xfe0f ),
		range( 0xfe20, 0xfe23 ),
		range( 0x10a01, 0x10a03 ),
		range( 0x10a05, 0x10a06 ),
		range( 0x10a0c, 0x10a0f ),
		range( 0x10a38, 0x10a3a ),
		array( 0x10a3f ),
		range( 0x1d167, 0x1d169 ),
		range( 0x1d17b, 0x1d182 ),
		range( 0x1d185, 0x1d18b ),
		range( 0x1d1aa, 0x1d1ad ),
		range( 0x1d242, 0x1d244 ),
		range( 0xe0100, 0xe01ef )
	);

	// Start with a Latin letter: the marks are only expected to be removed when they follow one.
	$combining_marks = 'a';

	// Append each code point as UTF-8. Every listed value is >= U+0300, so the
	// one-byte (ASCII) form is never needed and only 2-, 3- and 4-byte sequences are built.
	foreach ( $code_points as $code_point ) {
		if ( $code_point < 0x800 ) {
			$combining_marks .= chr( 0xc0 | ( $code_point >> 6 ) )
				. chr( 0x80 | ( $code_point & 0x3f ) );
		} elseif ( $code_point < 0x10000 ) {
			$combining_marks .= chr( 0xe0 | ( $code_point >> 12 ) )
				. chr( 0x80 | ( ( $code_point >> 6 ) & 0x3f ) )
				. chr( 0x80 | ( $code_point & 0x3f ) );
		} else {
			$combining_marks .= chr( 0xf0 | ( $code_point >> 18 ) )
				. chr( 0x80 | ( ( $code_point >> 12 ) & 0x3f ) )
				. chr( 0x80 | ( ( $code_point >> 6 ) & 0x3f ) )
				. chr( 0x80 | ( $code_point & 0x3f ) );
		}
	}

	// Performs the test: all the characters should be removed.
	$this->assertSame( 'a', remove_accents( $combining_marks ) );

	if ( _wp_can_use_pcre_ucp() ) { // If UCP available.
		// Test single-byte replace also.
		// NOTE(review): _wp_can_use_pcre_ucp() is defined elsewhere; presumably `false` overrides
		// the detected value and 'reset' restores it - confirm against its definition.
		_wp_can_use_pcre_ucp( false );
		$without_ucp = remove_accents( $combining_marks );

		// Restore the setting before asserting, so a failing assertion cannot
		// leave the override in place for tests that run afterwards.
		_wp_can_use_pcre_ucp( 'reset' );

		$this->assertSame( 'a', $without_ucp );
	}
}
| 306 | |
/**
 * Checks that remove_accents() converts each provided input into the expected output.
 *
 * @ticket 24661
 * @dataProvider data_remove_accents_combining
 *
 * @param string $input  String passed to remove_accents().
 * @param string $output Expected return value of remove_accents().
 */
public function test_remove_accents_combining( $input, $output ) {
	$this->assertSame( $output, remove_accents( $input ) );
}
| 314 | |
/**
 * Data provider for test_remove_accents_combining().
 *
 * Declared public static so it is accepted by PHPUnit versions that require
 * static data providers as well as by older ones.
 *
 * @return array[] List of data sets, each holding the input string followed by the expected output.
 */
public static function data_remove_accents_combining() {
	return array(
		// Test a collection of filenames that could have problems.
		array(
			"Capture d’e\xcc\x81cran 2013-02-20 a\xcc\x80 23.36.06.png", // NOTE: apostrophe is U+2019
			// Ideally U+2019 would be turned into the ASCII apostrophe U+0027, giving
			// "Capture d'ecran 2013-02-20 a 23.36.06.png"; the expectation below keeps it unchanged.
			"Capture d’ecran 2013-02-20 a 23.36.06.png",
		),
		array(
			"Buttermo\xcc\x88deli.jpg",
			"Buttermodeli.jpg",
		),
		array(
			"Mu\xcc\x88nsterl.Mai13a.jpg",
			"Munsterl.Mai13a.jpg",
		),

		// Pangram-style sentences with accented letters.
		array(
			'Cáo nâu lanh lẹ nhảy qua người lười biếng. Do bạch kim rất quý, sẽ để lắp vô xương',
			"Cao nau lanh le nhay qua nguoi luoi bieng. Do bach kim rat quy, se de lap vo xuong",
		),
		array(
			'Příliš žluťoučký kůň úpěl ďábelské kódy. Pójdźże, kiń tę chmurność w głąb flaszy! Päťtýždňové vĺčatá nervózne štekajú na môjho ďatľa v tŕní.',
			"Prilis zlutoucky kun upel dabelske kody. Pojdzze, kin te chmurnosc w glab flaszy! Pattyzdnove vlcata nervozne stekaju na mojho datla v trni.",
		),
		array(
			"Les naïfs ægithales hâtifs pondant à noël où il gèle sont sûrs d'être déçus en voyant leurs drôles d'œufs abîmés.",
			"Les naifs aegithales hatifs pondant a noel ou il gele sont surs d'etre decus en voyant leurs droles d'oeufs abimes.",
		),
		array(
			'a᷄ (a + combining macron-acute). +⃟ (plus with enclosing diamond). a︠e︡ (ae with a combining ligature)',
			"a (a + combining macron-acute). +⃟ (plus with enclosing diamond). ae (ae with a combining ligature)", // NOTE: Not doing "Me" enclosing combining marks.
		),

		// From UTF-8 Sampler http://www.columbia.edu/~fdc/utf8/
		array(
			// NOTE: U+200C ZERO WIDTH NON-JOINER not dealt with.
			"Im finſteren Jagdſchloß am offenen Felsquellwaſſer patzte der affig-flatterhafte kauzig-höf\xe2\x80\x8cliche Bäcker über ſeinem verſifften kniffligen C-Xylophon.",
			"Im finsteren Jagdschlos am offenen Felsquellwasser patzte der affig-flatterhafte kauzig-hof\xe2\x80\x8cliche Backer uber seinem versifften kniffligen C-Xylophon.",
		),
	);
}