Ticket #24661: 24661.2.patch
File 24661.2.patch, 9.3 KB (added by , 8 years ago) |
---|
-
src/wp-includes/formatting.php
1700 1700 } 1701 1701 1702 1702 $string = strtr($string, $chars); 1703 1704 // Removes "Mn" nonspacing combining marks that follow Latin characters. 1705 if ( false !== @preg_match( '/\p{L}/u', '' ) ) { // If UCP available. 1706 $string = preg_replace( '/(?<=\p{Latin})\p{Mn}+/u', '', $string ); 1707 } else { 1708 if ( ! defined( 'WP_MN_REGEX_ALTS' ) ) { 1709 require dirname( __FILE__ ) . '/unicode/regex_alts.php'; 1710 } 1711 $string = preg_replace( '/(?<=' . WP_LATIN_REGEX_ALTS . ')(?:' . WP_MN_REGEX_ALTS . ')+/', '', $string ); 1712 } 1713 1703 1714 } else { 1704 1715 $chars = array(); 1705 1716 // Assume ISO-8859-1 if not UTF-8 -
src/wp-includes/unicode/regex_alts.php
1 <?php 2 /* 3 * Generated by "gen_cat_regex_alts.php" from "UnicodeData.txt". Don't edit! 4 * Alternation of byte sequences (for patterns without the /u modifier) matching the UTF-8 encodings of the Mn (nonspacing mark) general category code points. 5 * 6 * See http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt 7 */ 8 9 define( 'WP_MN_REGEX_ALTS', '\xcc[\x80-\xbf]|\xcd[\x80-\xaf]|\xd2[\x83-\x87]|\xd6[\x91-\xbd\xbf]|\xd7[\x81\x82\x84\x85\x87]|\xd8[\x90-\x9a]|\xd9[\x8b-\x9f\xb0]|\xdb[\x96-\x9c\x9f-\xa4\xa7\xa8\xaa-\xad]|\xdc[\x91\xb0-\xbf]|\xdd[\x80-\x8a]|\xde[\xa6-\xb0]|\xdf[\xab-\xb3]|\xe0(?:\xa0[\x96-\x99\x9b-\xa3\xa5-\xa7\xa9-\xad]|\xa1[\x99-\x9b]|\xa3[\x94-\xa1\xa3-\xbf]|\xa4[\x80-\x82\xba\xbc]|\xa5[\x81-\x88\x8d\x91-\x97\xa2\xa3]|\xa6[\x81\xbc]|\xa7[\x81-\x84\x8d\xa2\xa3]|\xa8[\x81\x82\xbc]|\xa9[\x81\x82\x87\x88\x8b-\x8d\x91\xb0\xb1\xb5]|\xaa[\x81\x82\xbc]|\xab[\x81-\x85\x87\x88\x8d\xa2\xa3]|\xac[\x81\xbc\xbf]|\xad[\x81-\x84\x8d\x96\xa2\xa3]|\xae\x82|\xaf[\x80\x8d]|\xb0[\x80\xbe\xbf]|\xb1[\x80\x86-\x88\x8a-\x8d\x95\x96\xa2\xa3]|\xb2[\x81\xbc\xbf]|\xb3[\x86\x8c\x8d\xa2\xa3]|\xb4\x81|\xb5[\x81-\x84\x8d\xa2\xa3]|\xb7[\x8a\x92-\x94\x96]|\xb8[\xb1\xb4-\xba]|\xb9[\x87-\x8e]|\xba[\xb1\xb4-\xb9\xbb\xbc]|\xbb[\x88-\x8d]|\xbc[\x98\x99\xb5\xb7\xb9]|\xbd[\xb1-\xbe]|\xbe[\x80-\x84\x86\x87\x8d-\x97\x99-\xbc]|\xbf\x86)|\xe1(?:\x80[\xad-\xb0\xb2-\xb7\xb9\xba\xbd\xbe]|\x81[\x98\x99\x9e-\xa0\xb1-\xb4]|\x82[\x82\x85\x86\x8d\x9d]|\x8d[\x9d-\x9f]|\x9c[\x92-\x94\xb2-\xb4]|\x9d[\x92\x93\xb2\xb3]|\x9e[\xb4\xb5\xb7-\xbd]|\x9f[\x86\x89-\x93\x9d]|\xa0[\x8b-\x8d]|\xa2[\x85\x86\xa9]|\xa4[\xa0-\xa2\xa7\xa8\xb2\xb9-\xbb]|\xa8[\x97\x98\x9b]|\xa9[\x96\x98-\x9e\xa0\xa2\xa5-\xac\xb3-\xbc\xbf]|\xaa[\xb0-\xbd]|\xac[\x80-\x83\xb4\xb6-\xba\xbc]|\xad[\x82\xab-\xb3]|\xae[\x80\x81\xa2-\xa5\xa8\xa9\xab-\xad]|\xaf[\xa6\xa8\xa9\xad\xaf-\xb1]|\xb0[\xac-\xb3\xb6\xb7]|\xb3[\x90-\x92\x94-\xa0\xa2-\xa8\xad\xb4\xb8\xb9]|\xb7[\x80-\xb5\xbb-\xbf])|\xe2(?:\x83[\x90-\x9c\xa1\xa5-\xb0]|\xb3[\xaf-\xb1]|\xb5\xbf|\xb7[\xa0-\xbf])|\xe3(?:\x80[\xaa-\xad]|\x82[\x99\x9a])|\xea(?:\x99[\xaf\xb4-\xbd]|\x9a[\x9e\x9f]|\x9b[\xb0\xb1]|\xa0[\x82\x
86\x8b\xa5\xa6]|\xa3[\x84\x85\xa0-\xb1]|\xa4[\xa6-\xad]|\xa5[\x87-\x91]|\xa6[\x80-\x82\xb3\xb6-\xb9\xbc]|\xa7\xa5|\xa8[\xa9-\xae\xb1\xb2\xb5\xb6]|\xa9[\x83\x8c\xbc]|\xaa[\xb0\xb2-\xb4\xb7\xb8\xbe\xbf]|\xab[\x81\xac\xad\xb6]|\xaf[\xa5\xa8\xad])|\xef(?:\xac\x9e|\xb8[\x80-\x8f\xa0-\xaf])|\xf0(?:\x90(?:\x87\xbd|\x8b\xa0|\x8d[\xb6-\xba]|\xa8[\x81-\x83\x85\x86\x8c-\x8f\xb8-\xba\xbf]|\xab[\xa5\xa6])|\x91(?:\x80[\x81\xb8-\xbf]|\x81[\x80-\x86\xbf]|\x82[\x80\x81\xb3-\xb6\xb9\xba]|\x84[\x80-\x82\xa7-\xab\xad-\xb4]|\x85\xb3|\x86[\x80\x81\xb6-\xbe]|\x87[\x8a-\x8c]|\x88[\xaf-\xb1\xb4\xb6\xb7\xbe]|\x8b[\x9f\xa3-\xaa]|\x8c[\x80\x81\xbc]|\x8d[\x80\xa6-\xac\xb0-\xb4]|\x90[\xb8-\xbf]|\x91[\x82-\x84\x86]|\x92[\xb3-\xb8\xba\xbf]|\x93[\x80\x82\x83]|\x96[\xb2-\xb5\xbc\xbd\xbf]|\x97[\x80\x9c\x9d]|\x98[\xb3-\xba\xbd\xbf]|\x99\x80|\x9a[\xab\xad\xb0-\xb5\xb7]|\x9c[\x9d-\x9f\xa2-\xa5\xa7-\xab]|\xb0[\xb0-\xb6\xb8-\xbd\xbf]|\xb2[\x92-\xa7\xaa-\xb0\xb2\xb3\xb5\xb6])|\x96(?:\xab[\xb0-\xb4]|\xac[\xb0-\xb6]|\xbe[\x8f-\x92])|\x9b\xb2[\x9d\x9e]|\x9d(?:\x85[\xa7-\xa9\xbb-\xbf]|\x86[\x80-\x82\x85-\x8b\xaa-\xad]|\x89[\x82-\x84]|\xa8[\x80-\xb6\xbb-\xbf]|\xa9[\x80-\xac\xb5]|\xaa[\x84\x9b-\x9f\xa1-\xaf])|\x9e(?:\x80[\x80-\x86\x88-\x98\x9b-\xa1\xa3\xa4\xa6-\xaa]|\xa3[\x90-\x96]|\xa5[\x84-\x8a]))|\xf3\xa0(?:[\x84-\x86][\x80-\xbf]|\x87[\x80-\xaf])' ); // 1690 code points. 10 11 /* 12 * Generated by "gen_script_regex_alts.php" from "Scripts.txt". Don't edit! 13 * Alternation of byte sequences (for patterns without the /u modifier) matching the UTF-8 encodings of the Latin script code points. 
14 * 15 * See http://www.unicode.org/Public/9.0.0/ucd/Scripts.txt 16 */ 17 18 define( 'WP_LATIN_REGEX_ALTS', '[\x41-\x5a\x61-\x7a]|\xc2[\xaa\xba]|\xc3[\x80-\x96\x98-\xb6\xb8-\xbf]|[\xc4-\xc9][\x80-\xbf]|\xca[\x80-\xb8]|\xcb[\xa0-\xa4]|\xe1(?:\xb4[\x80-\xa5\xac-\xbf]|\xb5[\x80-\x9c\xa2-\xa5\xab-\xb7\xb9-\xbf]|\xb6[\x80-\xbe]|[\xb8-\xbb][\x80-\xbf])|\xe2(?:\x81[\xb1\xbf]|\x82[\x90-\x9c]|\x84[\xaa\xab\xb2]|\x85[\x8e\xa0-\xbf]|\x86[\x80-\x88]|\xb1[\xa0-\xbf])|\xea(?:\x9c[\xa2-\xbf]|\x9d[\x80-\xbf]|\x9e[\x80-\x87\x8b-\xae\xb0-\xb7]|\x9f[\xb7-\xbf]|\xac[\xb0-\xbf]|\xad[\x80-\x9a\x9c-\xa4])|\xef(?:\xac[\x80-\x86]|\xbc[\xa1-\xba]|\xbd[\x81-\x9a])' ); // 1350 code points. -
tests/phpunit/tests/formatting/RemoveAccents.php
126 126 127 127 $this->assertEquals( 'al·lallalla', remove_accents( 'al·lallaŀla' ) ); 128 128 } 129 130 /** 131 * @ticket 24661 132 */ 133 public function test_remove_accents_mn_combining_marks() { 134 // Creates a string with some Mn nonspacing combining marks. 135 $code_points = array_merge( 136 range( 0x0300, 0x036F ) // Combining Diacritical Marks 137 , range( 0x1DC0, 0x1DF5 ) // Combining Diacritical Marks Supplement 138 , range( 0x20D0, 0x20DC ) // Combining Diacritical Marks for Symbols 139 , range( 0x20E5, 0x20F0 ) // Combining Diacritical Marks for Symbols 140 , range( 0xFE20, 0xFE2D ) // Combining Half Marks (restricted to Unicode 7.0) 141 ); 142 $combining_marks = 'a'; // Marks are only removed when they follow a Latin character, so start with one. 143 foreach ( $code_points as $code_point ) { 144 if ( $code_point <= 0x07ff ) { 145 $combining_marks .= chr( 0xc0 | ( $code_point >> 6 ) ) . chr( 0x80 | ( $code_point & 0x003f ) ); 146 } else { 147 $combining_marks .= chr( 0xe0 | ( $code_point >> 12 ) ) . chr( 0x80 | ( ( $code_point >> 6 ) & 0x003f ) ) . chr( 0x80 | ( $code_point & 0x003f ) ); 148 } 149 } 150 // Performs the test: all the combining marks should be removed, leaving only the leading 'a'. 151 $this->assertSame( 'a', remove_accents( $combining_marks ) ); 152 153 if ( false !== @preg_match( '/\p{L}/u', '' ) ) { // If UCP available. 154 // Also run the byte-wise (non-/u) fallback pattern directly, so it is covered even when UCP is available. 155 $string = $combining_marks; 156 if ( ! defined( 'WP_MN_REGEX_ALTS' ) ) { 157 require ABSPATH . WPINC . '/unicode/regex_alts.php'; 158 } 159 $string = preg_replace( '/(?<=' . WP_LATIN_REGEX_ALTS . ')(?:' . WP_MN_REGEX_ALTS . ')+/', '', $string ); 160 $this->assertSame( 'a', $string ); 161 } 162 } 163 164 /** 165 * @ticket 24661 166 * @dataProvider data_remove_accents_combining 167 */ 168 function test_remove_accents_combining( $input, $output ) { 169 $this->assertSame( $output, remove_accents( $input ) ); 170 } 171 172 function data_remove_accents_combining() { 173 return array( 174 // Test a collection of filenames that could have problems. 
175 array( 176 "Capture d’e\xcc\x81cran 2013-02-20 a\xcc\x80 23.36.06.png", // NOTE: apostrophe is U+2019 177 //"Capture d'ecran 2013-02-20 a 23.36.06.png", // NOTE: apostrophe should go to U+0027 (ASCII apostrophe) 178 "Capture d’ecran 2013-02-20 a 23.36.06.png", 179 ), 180 array( 181 "Buttermo\xcc\x88deli.jpg", 182 "Buttermodeli.jpg", 183 ), 184 array( 185 "Mu\xcc\x88nsterl.Mai13a.jpg", 186 "Munsterl.Mai13a.jpg", 187 ), 188 189 array( 190 'Cáo nâu lanh lẹ nhảy qua người lười biếng. Do bạch kim rất quý, sẽ để lắp vô xương', 191 "Cao nau lanh le nhay qua nguoi luoi bieng. Do bach kim rat quy, se de lap vo xuong", 192 ), 193 array( 194 'Příliš žluťoučký kůň úpěl ďábelské kódy. Pójdźże, kiń tę chmurność w głąb flaszy! Päťtýždňové vĺčatá nervózne štekajú na môjho ďatľa v tŕní.', 195 "Prilis zlutoucky kun upel dabelske kody. Pojdzze, kin te chmurnosc w glab flaszy! Pattyzdnove vlcata nervozne stekaju na mojho datla v trni.", 196 ), 197 array( 198 "Les naïfs ægithales hâtifs pondant à noël où il gèle sont sûrs d'être déçus en voyant leurs drôles d'œufs abîmés.", 199 "Les naifs aegithales hatifs pondant a noel ou il gele sont surs d'etre decus en voyant leurs droles d'oeufs abimes.", 200 ), 201 array( 202 'a᷄ (a + combining macron-acute). +⃟ (plus with enclosing diamond). a︠e︡ (ae with a combining ligature)', 203 "a (a + combining macron-acute). +⃟ (plus with enclosing diamond). ae (ae with a combining ligature)", // NOTE: Not doing "Me" enclosing combining marks. 204 ), 205 206 // From UTF-8 Sampler http://www.columbia.edu/~fdc/utf8/ 207 array( 208 // NOTE: U+200C ZERO WIDTH NON-JOINER not dealt with. 209 "Im finſteren Jagdſchloß am offenen Felsquellwaſſer patzte der affig-flatterhafte kauzig-höf\xe2\x80\x8cliche Bäcker über ſeinem verſifften kniffligen C-Xylophon.", 210 "Im finsteren Jagdschlos am offenen Felsquellwasser patzte der affig-flatterhafte kauzig-hof\xe2\x80\x8cliche Backer uber seinem versifften kniffligen C-Xylophon.", 211 ), 212 ); 213 } 129 214 }