WordPress.org

Make WordPress Core

Changeset 48072


Ignore:
Timestamp:
06/17/2020 03:22:49 PM (4 months ago)
Author:
swissspidy
Message:

Sitemaps: Add XML sitemaps functionality to WordPress.

While web crawlers are able to discover pages from links within the site and from other sites, XML sitemaps supplement this approach by allowing crawlers to quickly and comprehensively identify all URLs included in the sitemap and learn other signals about those URLs using the associated metadata.

See https://make.wordpress.org/core/2020/06/10/merge-announcement-extensible-core-sitemaps/ for more details.

This feature exposes the sitemap index via /wp-sitemap.xml and exposes a variety of new filters and hooks for developers to modify the behavior. Users can disable sitemaps completely by turning off search engine visibility in WordPress admin.

This change also introduces a new esc_xml() function to escape strings for output in XML, as well as XML support to wp_kses_normalize_entities().

Props Adrian McShane, afragen, adamsilverstein, casiepa, flixos90, garrett-eclipse, joemcgill, kburgoine, kraftbj, milana_cap, pacifika, pbiron, pfefferle, Ruxandra Gradina, swissspidy, szepeviktor, tangrufus, tweetythierry.
Fixes #50117.
See #3670. See #19998.

Location:
trunk
Files:
28 added
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/phpcs.xml.dist

    r48036 r48072  
    248248                <element value="WP_Import_UnitTestCase"/>
    249249                <element value="Tests_Query_Conditionals"/>
     250                <element value="WP_Test_XML_TestCase"/>
    250251
    251252                <!-- Mock classes. -->
  • trunk/src/wp-includes/canonical.php

    r48026 r48072  
    508508            if ( ! empty( $addl_path ) ) {
    509509                $redirect['path'] = trailingslashit( $redirect['path'] ) . $addl_path;
     510            }
     511
     512            // Remove trailing slash for sitemaps requests.
     513            if ( ! empty( get_query_var( 'sitemap' ) ) ) {
     514                $redirect['path'] = untrailingslashit( $redirect['path'] );
    510515            }
    511516
     
    652657    }
    653658
     659    // Remove trailing slash for sitemaps requests.
     660    if ( ! empty( get_query_var( 'sitemap' ) ) || ! empty( get_query_var( 'sitemap-stylesheet' ) ) ) {
     661        $redirect['path'] = untrailingslashit( $redirect['path'] );
     662    }
     663
    654664    // Strip multiple slashes out of the URL.
    655665    if ( strpos( $redirect['path'], '//' ) > -1 ) {
  • trunk/src/wp-includes/default-filters.php

    r47947 r48072  
    457457add_action( 'parse_request', 'rest_api_loaded' );
    458458
     459// Sitemaps actions.
     460add_action( 'init', 'wp_sitemaps_get_server' );
     461
    459462/**
    460463 * Filters formerly mixed into wp-includes.
  • trunk/src/wp-includes/formatting.php

    r48048 r48072  
    936936 *
    937937 * @since 1.2.2
     938 * @since 5.5.0 `$quote_style` also accepts `ENT_XML1`.
    938939 * @access private
    939940 *
     
    943944 * @param int|string   $quote_style   Optional. Converts double quotes if set to ENT_COMPAT,
    944945 *                                    both single and double if set to ENT_QUOTES or none if set to ENT_NOQUOTES.
    945  *                                    Also compatible with old values; converting single quotes if set to 'single',
     946 *                                    Converts single and double quotes, as well as converting HTML
     947 *                                    named entities (that are not also XML named entities) to their
     948 *                                    code points if set to ENT_XML1. Also compatible with old values;
     949 *                                    converting single quotes if set to 'single',
    946950 *                                    double if set to 'double' or both if otherwise set.
    947951 *                                    Default is ENT_NOQUOTES.
     
    965969    if ( empty( $quote_style ) ) {
    966970        $quote_style = ENT_NOQUOTES;
    967     } elseif ( ! in_array( $quote_style, array( 0, 2, 3, 'single', 'double' ), true ) ) {
     971    } elseif ( ENT_XML1 === $quote_style ) {
     972        $quote_style = ENT_QUOTES | ENT_XML1;
     973    } elseif ( ! in_array( $quote_style, array( ENT_NOQUOTES, ENT_COMPAT, ENT_QUOTES, 'single', 'double' ), true ) ) {
    968974        $quote_style = ENT_QUOTES;
    969975    }
     
    9951001        // Guarantee every &entity; is valid, convert &garbage; into &amp;garbage;
    9961002        // This is required for PHP < 5.4.0 because ENT_HTML401 flag is unavailable.
    997         $string = wp_kses_normalize_entities( $string );
     1003        $string = wp_kses_normalize_entities( $string, ( $quote_style & ENT_XML1 ) ? 'xml' : 'html' );
    9981004    }
    9991005
     
    45384544
    45394545/**
     4546 * Escaping for XML blocks.
     4547 *
     4548 * @since 5.5.0
     4549 *
     4550 * @param string $text Text to escape.
     4551 * @return string Escaped text.
     4552 */
     4553function esc_xml( $text ) {
     4554    $safe_text = wp_check_invalid_utf8( $text );
     4555
     4556    $cdata_regex = '\<\!\[CDATA\[.*?\]\]\>';
     4557    $regex       = <<<EOF
     4558/
     4559    (?=.*?{$cdata_regex})                 # lookahead that will match anything followed by a CDATA Section
     4560    (?<non_cdata_followed_by_cdata>(.*?)) # the "anything" matched by the lookahead
     4561    (?<cdata>({$cdata_regex}))            # the CDATA Section matched by the lookahead
     4562
     4563|                                         # alternative
     4564
     4565    (?<non_cdata>(.*))                    # non-CDATA Section
     4566/sx
     4567EOF;
     4568
     4569    $safe_text = (string) preg_replace_callback(
     4570        $regex,
     4571        static function( $matches ) {
     4572            if ( ! $matches[0] ) {
     4573                return '';
     4574            }
     4575
     4576            if ( ! empty( $matches['non_cdata'] ) ) {
     4577                // Escape HTML entities in the non-CDATA Section.
     4578                return _wp_specialchars( $matches['non_cdata'], ENT_XML1 );
     4579            }
     4580
     4581            // Return the CDATA Section unchanged, escape HTML entities in the rest.
     4582            return _wp_specialchars( $matches['non_cdata_followed_by_cdata'], ENT_XML1 ) . $matches['cdata'];
     4583        },
     4584        $safe_text
     4585    );
     4586
     4587    /**
     4588     * Filters a string cleaned and escaped for output in XML.
     4589     *
     4590     * Text passed to esc_xml() is stripped of invalid or special characters
     4591     * before output. HTML named character references are converted to their
     4592     * equivalent code points.
     4593     *
     4594     * @since 5.5.0
     4595     *
     4596     * @param string $safe_text The text after it has been escaped.
     4597     * @param string $text      The text prior to being escaped.
     4598     */
     4599    return apply_filters( 'esc_xml', $safe_text, $text );
     4600}
     4601
     4602/**
    45404603 * Escape an HTML tag name.
    45414604 *
  • trunk/src/wp-includes/kses.php

    r47892 r48072  
    4848// Ensure that these variables are added to the global namespace
    4949// (e.g. if using namespaces / autoload in the current PHP environment).
    50 global $allowedposttags, $allowedtags, $allowedentitynames;
     50global $allowedposttags, $allowedtags, $allowedentitynames, $allowedxmlentitynames;
    5151
    5252if ( ! CUSTOM_TAGS ) {
     
    705705    );
    706706
     707    /**
     708     * @var string[] $allowedxmlnamedentities Array of KSES allowed XML entity names.
     709     * @since 5.5.0
     710     */
     711    $allowedxmlnamedentities = array(
     712        'amp',
     713        'lt',
     714        'gt',
     715        'apos',
     716        'quot',
     717    );
     718
    707719    $allowedposttags = array_map( '_wp_add_global_attributes', $allowedposttags );
    708720} else {
     
    17461758 * `AT&amp;T`, `&#00058;` to `&#58;`, `&#XYZZY;` to `&amp;#XYZZY;` and so on.
    17471759 *
     1760 * When `$context` is set to 'xml', HTML entities are converted to their code points.  For
     1761 * example, `AT&T&hellip;&#XYZZY;` is converted to `AT&amp;T…&amp;#XYZZY;`.
     1762 *
    17481763 * @since 1.0.0
    1749  *
    1750  * @param string $string Content to normalize entities.
     1764 * @since 5.5.0 Added `$context` parameter.
     1765 *
     1766 * @param string $string  Content to normalize entities.
     1767 * @param string $context Context for normalization. Can be either 'html' or 'xml'.
     1768 *                        Default 'html'.
    17511769 * @return string Content with normalized entities.
    17521770 */
    1753 function wp_kses_normalize_entities( $string ) {
     1771function wp_kses_normalize_entities( $string, $context = 'html' ) {
    17541772    // Disarm all entities by converting & to &amp;
    17551773    $string = str_replace( '&', '&amp;', $string );
    17561774
    17571775    // Change back the allowed entities in our entity whitelist.
    1758     $string = preg_replace_callback( '/&amp;([A-Za-z]{2,8}[0-9]{0,2});/', 'wp_kses_named_entities', $string );
     1776    if ( 'xml' === $context ) {
     1777        $string = preg_replace_callback( '/&amp;([A-Za-z]{2,8}[0-9]{0,2});/', 'wp_kses_xml_named_entities', $string );
     1778    } else {
     1779        $string = preg_replace_callback( '/&amp;([A-Za-z]{2,8}[0-9]{0,2});/', 'wp_kses_named_entities', $string );
     1780    }
    17591781    $string = preg_replace_callback( '/&amp;#(0*[0-9]{1,7});/', 'wp_kses_normalize_entities2', $string );
    17601782    $string = preg_replace_callback( '/&amp;#[Xx](0*[0-9A-Fa-f]{1,6});/', 'wp_kses_normalize_entities3', $string );
     
    17851807    $i = $matches[1];
    17861808    return ( ! in_array( $i, $allowedentitynames, true ) ) ? "&amp;$i;" : "&$i;";
     1809}
     1810
     1811/**
     1812 * Callback for `wp_kses_normalize_entities()` regular expression.
     1813 *
     1814 * This function only accepts valid named entity references, which are finite,
     1815 * case-sensitive, and highly scrutinized by XML validators.  HTML named entity
     1816 * references are converted to their code points.
     1817 *
     1818 * @since 5.5.0
     1819 *
     1820 * @global array $allowedentitynames
     1821 * @global array $allowedxmlnamedentities
     1822 *
     1823 * @param array $matches preg_replace_callback() matches array.
     1824 * @return string Correctly encoded entity.
     1825 */
     1826function wp_kses_xml_named_entities( $matches ) {
     1827    global $allowedentitynames, $allowedxmlnamedentities;
     1828
     1829    if ( empty( $matches[1] ) ) {
     1830        return '';
     1831    }
     1832
     1833    $i = $matches[1];
     1834
     1835    if ( in_array( $i, $allowedxmlnamedentities, true ) ) {
     1836        return "&$i;";
     1837    } elseif ( in_array( $i, $allowedentitynames, true ) ) {
     1838        return html_entity_decode( "&$i;", ENT_HTML5 );
     1839    }
     1840
     1841    return "&amp;$i;";
    17871842}
    17881843
  • trunk/src/wp-settings.php

    r47612 r48072  
    264264require ABSPATH . WPINC . '/rest-api/search/class-wp-rest-search-handler.php';
    265265require ABSPATH . WPINC . '/rest-api/search/class-wp-rest-post-search-handler.php';
     266require ABSPATH . WPINC . '/sitemaps.php';
     267require ABSPATH . WPINC . '/sitemaps/class-wp-sitemaps.php';
     268require ABSPATH . WPINC . '/sitemaps/class-wp-sitemaps-index.php';
     269require ABSPATH . WPINC . '/sitemaps/class-wp-sitemaps-provider.php';
     270require ABSPATH . WPINC . '/sitemaps/class-wp-sitemaps-registry.php';
     271require ABSPATH . WPINC . '/sitemaps/class-wp-sitemaps-renderer.php';
     272require ABSPATH . WPINC . '/sitemaps/class-wp-sitemaps-stylesheet.php';
     273require ABSPATH . WPINC . '/sitemaps/providers/class-wp-sitemaps-posts.php';
     274require ABSPATH . WPINC . '/sitemaps/providers/class-wp-sitemaps-taxonomies.php';
     275require ABSPATH . WPINC . '/sitemaps/providers/class-wp-sitemaps-users.php';
    266276require ABSPATH . WPINC . '/class-wp-block-type.php';
    267277require ABSPATH . WPINC . '/class-wp-block-styles-registry.php';
  • trunk/tests/phpunit/includes/bootstrap.php

    r48059 r48072  
    154154require __DIR__ . '/testcase-ajax.php';
    155155require __DIR__ . '/testcase-canonical.php';
     156require __DIR__ . '/testcase-xml.php';
    156157require __DIR__ . '/exceptions.php';
    157158require __DIR__ . '/utils.php';
     
    160161require __DIR__ . '/class-wp-rest-test-configurable-controller.php';
    161162require __DIR__ . '/class-wp-fake-block-type.php';
     163require __DIR__ . '/class-wp-sitemaps-test-provider.php';
     164require __DIR__ . '/class-wp-sitemaps-empty-test-provider.php';
    162165
    163166/**
Note: See TracChangeset for help on using the changeset viewer.