Ticket #27896: 27896.1.diff
File 27896.1.diff, 10.5 KB (added 7 years ago)
parsers.php
From fcf80166c30f646d51a2938e5a06045f9146d705 Mon Sep 17 00:00:00 2001 From: Paul Biron <paul@sparrowhawkcomputing.com> Date: Tue, 6 Jun 2017 14:34:27 -0600 Subject: [PATCH] make WXR_Parser_SimpleXML & WXR_Parser_XML fully namespace aware --- parsers.php | 103 +++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 60 insertions(+), 43 deletions(-) diff --git a/parsers.php b/parsers.php index 35513fe..f324dc3 100644
a b 6 6 * @subpackage Importer 7 7 */ 8 8 9 define( 'WXR_NAMESPACE_URI', 'http://wordpress.org/export/1.2/' ); 10 define( 'WXR_EXCERPT_NAMESPACE_URI', WXR_NAMESPACE_URI . 'excerpt' ); 11 define( 'DUBLIN_CORE_NAMESPACE_URI', 'http://purl.org/dc/elements/1.1/' ); 12 define( 'CONTENT_NAMESPACE_URI', 'http://purl.org/rss/1.0/modules/content/' ); 13 9 14 /** 10 15 * WordPress Importer class for managing parsing of WXR files. 11 16 */ … … class WXR_Parser_SimpleXML { 79 84 if ( ! $xml ) 80 85 return new WP_Error( 'SimpleXML_parse_error', __( 'There was an error when reading this WXR file', 'wordpress-importer' ), libxml_get_errors() ); 81 86 87 $xml->registerXPathNamespace( 'wp', WXR_NAMESPACE_URI ); 88 $xml->registerXPathNamespace( 'excerpt', WXR_EXCERPT_NAMESPACE_URI ); 89 82 90 $wxr_version = $xml->xpath('/rss/channel/wp:wxr_version'); 83 91 if ( ! $wxr_version ) 84 92 return new WP_Error( 'WXR_parse_error', __( 'This does not appear to be a WXR file, missing/invalid WXR version number', 'wordpress-importer' ) ); … … class WXR_Parser_SimpleXML { 91 99 $base_url = $xml->xpath('/rss/channel/wp:base_site_url'); 92 100 $base_url = (string) trim( $base_url[0] ); 93 101 94 $namespaces = $xml->getDocNamespaces();95 if ( ! isset( $namespaces['wp'] ) )96 $namespaces['wp'] = 'http://wordpress.org/export/1.1/';97 if ( ! 
isset( $namespaces['excerpt'] ) )98 $namespaces['excerpt'] = 'http://wordpress.org/export/1.1/excerpt/';99 100 102 // grab authors 101 103 foreach ( $xml->xpath('/rss/channel/wp:author') as $author_arr ) { 102 $a = $author_arr->children( $namespaces['wp']);104 $a = $author_arr->children( WXR_NAMESPACE_URI ); 103 105 $login = (string) $a->author_login; 104 106 $authors[$login] = array( 105 107 'author_id' => (int) $a->author_id, … … class WXR_Parser_SimpleXML { 113 115 114 116 // grab cats, tags and terms 115 117 foreach ( $xml->xpath('/rss/channel/wp:category') as $term_arr ) { 116 $t = $term_arr->children( $namespaces['wp']);118 $t = $term_arr->children( WXR_NAMESPACE_URI ); 117 119 $category = array( 118 120 'term_id' => (int) $t->term_id, 119 121 'category_nicename' => (string) $t->category_nicename, … … class WXR_Parser_SimpleXML { 133 135 } 134 136 135 137 foreach ( $xml->xpath('/rss/channel/wp:tag') as $term_arr ) { 136 $t = $term_arr->children( $namespaces['wp']);138 $t = $term_arr->children( WXR_NAMESPACE_URI ); 137 139 $tag = array( 138 140 'term_id' => (int) $t->term_id, 139 141 'tag_slug' => (string) $t->tag_slug, … … class WXR_Parser_SimpleXML { 152 154 } 153 155 154 156 foreach ( $xml->xpath('/rss/channel/wp:term') as $term_arr ) { 155 $t = $term_arr->children( $namespaces['wp']);157 $t = $term_arr->children( WXR_NAMESPACE_URI ); 156 158 $term = array( 157 159 'term_id' => (int) $t->term_id, 158 160 'term_taxonomy' => (string) $t->term_taxonomy, … … class WXR_Parser_SimpleXML { 179 181 'guid' => (string) $item->guid, 180 182 ); 181 183 182 $dc = $item->children( 'http://purl.org/dc/elements/1.1/');184 $dc = $item->children( DUBLIN_CORE_NAMESPACE_URI ); 183 185 $post['post_author'] = (string) $dc->creator; 184 186 185 $content = $item->children( 'http://purl.org/rss/1.0/modules/content/');186 $excerpt = $item->children( $namespaces['excerpt']);187 $content = $item->children( CONTENT_NAMESPACE_URI ); 188 $excerpt = $item->children( 
WXR_EXCERPT_NAMESPACE_URI ); 187 189 $post['post_content'] = (string) $content->encoded; 188 190 $post['post_excerpt'] = (string) $excerpt->encoded; 189 191 190 $wp = $item->children( $namespaces['wp']);192 $wp = $item->children( WXR_NAMESPACE_URI ); 191 193 $post['post_id'] = (int) $wp->post_id; 192 194 $post['post_date'] = (string) $wp->post_date; 193 195 $post['post_date_gmt'] = (string) $wp->post_date_gmt; … … class WXR_Parser_SimpleXML { 268 270 * WXR Parser that makes use of the XML Parser PHP extension. 269 271 */ 270 272 class WXR_Parser_XML { 273 // we'll put these into the WXR Namespace in __construct() 271 274 var $wp_tags = array( 272 ' wp:post_id', 'wp:post_date', 'wp:post_date_gmt', 'wp:comment_status', 'wp:ping_status', 'wp:attachment_url',273 ' wp:status', 'wp:post_name', 'wp:post_parent', 'wp:menu_order', 'wp:post_type', 'wp:post_password',274 ' wp:is_sticky', 'wp:term_id', 'wp:category_nicename', 'wp:category_parent', 'wp:cat_name', 'wp:category_description',275 ' wp:tag_slug', 'wp:tag_name', 'wp:tag_description', 'wp:term_taxonomy', 'wp:term_parent',276 ' wp:term_name', 'wp:term_description', 'wp:author_id', 'wp:author_login', 'wp:author_email', 'wp:author_display_name',277 ' wp:author_first_name', 'wp:author_last_name',275 'post_id', 'post_date', 'post_date_gmt', 'comment_status', 'ping_status', 'attachment_url', 276 'status', 'post_name', 'post_parent', 'menu_order', 'post_type', 'post_password', 277 'is_sticky', 'term_id', 'category_nicename', 'category_parent', 'cat_name', 'category_description', 278 'tag_slug', 'tag_name', 'tag_description', 'term_taxonomy', 'term_parent', 279 'term_name', 'term_description', 'author_id', 'author_login', 'author_email', 'author_display_name', 280 'author_first_name', 'author_last_name', 278 281 ); 282 // we'll put these into the WXR Namespace in __construct() 279 283 var $wp_sub_tags = array( 280 ' wp:comment_id', 'wp:comment_author', 'wp:comment_author_email', 'wp:comment_author_url',281 ' 
wp:comment_author_IP', 'wp:comment_date', 'wp:comment_date_gmt', 'wp:comment_content',282 ' wp:comment_approved', 'wp:comment_type', 'wp:comment_parent', 'wp:comment_user_id',284 'comment_id', 'comment_author', 'comment_author_email', 'comment_author_url', 285 'comment_author_IP', 'comment_date', 'comment_date_gmt', 'comment_content', 286 'comment_approved', 'comment_type', 'comment_parent', 'comment_user_id', 283 287 ); 284 288 289 /** 290 * Constructor 291 * 292 * @since 0.x.y 293 */ 294 function __construct() { 295 // put wp_tags & wp_sub_tags in the WXR Namespace 296 $this->wp_tags = array_map( function($tag) { return WXR_NAMESPACE_URI . ':' . $tag; }, $this->wp_tags ); 297 $this->wp_sub_tags = array_map( function($tag) { return WXR_NAMESPACE_URI . ':' . $tag; }, $this->wp_sub_tags ); 298 } 299 285 300 function parse( $file ) { 286 301 $this->wxr_version = $this->in_post = $this->cdata = $this->data = $this->sub_data = $this->in_tag = $this->in_sub_tag = false; 287 302 $this->authors = $this->posts = $this->term = $this->category = $this->tag = array(); 288 303 289 $xml = xml_parser_create ( 'UTF-8' );304 $xml = xml_parser_create_ns( 'UTF-8' ); 290 305 xml_parser_set_option( $xml, XML_OPTION_SKIP_WHITE, 1 ); 291 306 xml_parser_set_option( $xml, XML_OPTION_CASE_FOLDING, 0 ); 292 307 xml_set_object( $xml, $this ); … … class WXR_Parser_XML { 318 333 319 334 function tag_open( $parse, $tag, $attr ) { 320 335 if ( in_array( $tag, $this->wp_tags ) ) { 321 $this->in_tag = substr( $tag, 3);336 $this->in_tag = substr( $tag, strrpos( $tag, ':' ) + 1 ); 322 337 return; 323 338 } 324 339 325 340 if ( in_array( $tag, $this->wp_sub_tags ) ) { 326 $this->in_sub_tag = substr( $tag, 3);341 $this->in_sub_tag = substr( $tag, strrpos( $tag, ':' ) + 1 ); 327 342 return; 328 343 } 329 344 … … class WXR_Parser_XML { 337 352 case 'item': $this->in_post = true; 338 353 case 'title': if ( $this->in_post ) $this->in_tag = 'post_title'; break; 339 354 case 'guid': $this->in_tag = 'guid'; 
break; 340 case 'dc:creator': $this->in_tag = 'post_author'; break;341 case 'content:encoded': $this->in_tag = 'post_content'; break;342 case 'excerpt:encoded': $this->in_tag = 'post_excerpt'; break;355 case DUBLIN_CORE_NAMESPACE_URI . ':creator': $this->in_tag = 'post_author'; break; 356 case CONTENT_NAMESPACE_URI . ':encoded': $this->in_tag = 'post_content'; break; 357 case WXR_EXCERPT_NAMESPACE_URI . ':encoded': $this->in_tag = 'post_excerpt'; break; 343 358 344 case 'wp:term_slug': $this->in_tag = 'slug'; break;345 case 'wp:meta_key': $this->in_sub_tag = 'key'; break;346 case 'wp:meta_value': $this->in_sub_tag = 'value'; break;359 case WXR_NAMESPACE_URI . ':term_slug': $this->in_tag = 'slug'; break; 360 case WXR_NAMESPACE_URI . ':meta_key': $this->in_sub_tag = 'key'; break; 361 case WXR_NAMESPACE_URI . ':meta_value': $this->in_sub_tag = 'value'; break; 347 362 } 348 363 } 349 364 … … class WXR_Parser_XML { 360 375 361 376 function tag_close( $parser, $tag ) { 362 377 switch ( $tag ) { 363 case 'wp:comment':378 case WXR_NAMESPACE_URI . ':comment': 364 379 unset( $this->sub_data['key'], $this->sub_data['value'] ); // remove meta sub_data 365 380 if ( ! empty( $this->sub_data ) ) 366 381 $this->data['comments'][] = $this->sub_data; 367 382 $this->sub_data = false; 368 383 break; 369 case 'wp:commentmeta':384 case WXR_NAMESPACE_URI . ':commentmeta': 370 385 $this->sub_data['commentmeta'][] = array( 371 386 'key' => $this->sub_data['key'], 372 387 'value' => $this->sub_data['value'] … … class WXR_Parser_XML { 379 394 } 380 395 $this->sub_data = false; 381 396 break; 382 case 'wp:postmeta':397 case WXR_NAMESPACE_URI . ':postmeta': 383 398 if ( ! empty( $this->sub_data ) ) 384 399 $this->data['postmeta'][] = $this->sub_data; 385 400 $this->sub_data = false; 386 401 break; 387 case 'wp:termmeta':402 case WXR_NAMESPACE_URI . ':termmeta': 388 403 if ( ! 
empty( $this->sub_data ) ) 389 404 $this->data['termmeta'][] = $this->sub_data; 390 405 $this->sub_data = false; … … class WXR_Parser_XML { 393 408 $this->posts[] = $this->data; 394 409 $this->data = false; 395 410 break; 396 case 'wp:category':397 case 'wp:tag':398 case 'wp:term':399 $n = substr( $tag, 3);411 case WXR_NAMESPACE_URI . ':category': 412 case WXR_NAMESPACE_URI . ':tag': 413 case WXR_NAMESPACE_URI . ':term': 414 $n = substr( $tag, strrpos( $tag, ':' ) + 1 ); 400 415 array_push( $this->$n, $this->data ); 401 416 $this->data = false; 402 417 break; 403 case 'wp:author':418 case WXR_NAMESPACE_URI . ':author': 404 419 if ( ! empty($this->data['author_login']) ) 405 420 $this->authors[$this->data['author_login']] = $this->data; 406 421 $this->data = false; 407 422 break; 408 case 'wp:base_site_url':423 case WXR_NAMESPACE_URI . ':base_site_url': 409 424 $this->base_url = $this->cdata; 410 425 break; 411 case 'wp:wxr_version':426 case WXR_NAMESPACE_URI . ':wxr_version': 412 427 $this->wxr_version = $this->cdata; 413 428 break; 414 429 … … class WXR_Parser_XML { 428 443 429 444 /** 430 445 * WXR Parser that uses regular expressions. Fallback for installs without an XML parser. 446 * 447 * @todo It is not worth (or probably even possible) making this regex parser namespace aware! 431 448 */ 432 449 class WXR_Parser_Regex { 433 450 var $authors = array();