WordPress.org

Make WordPress Core


Ignore:
Timestamp:
06/17/2020 03:22:49 PM (5 months ago)
Author:
swissspidy
Message:

Sitemaps: Add XML sitemaps functionality to WordPress.

While web crawlers are able to discover pages from links within the site and from other sites, XML sitemaps supplement this approach by allowing crawlers to quickly and comprehensively identify all URLs included in the sitemap and learn other signals about those URLs using the associated metadata.

See https://make.wordpress.org/core/2020/06/10/merge-announcement-extensible-core-sitemaps/ for more details.

This feature exposes the sitemap index via /wp-sitemap.xml and exposes a variety of new filters and hooks for developers to modify the behavior. Users can disable sitemaps completely by turning off search engine visibility in WordPress admin.

This change also introduces a new esc_xml() function to escape strings for output in XML, as well as XML support to wp_kses_normalize_entities().

Props Adrian McShane, afragen, adamsilverstein, casiepa, flixos90, garrett-eclipse, joemcgill, kburgoine, kraftbj, milana_cap, pacifika, pbiron, pfefferle, Ruxandra Gradina, swissspidy, szepeviktor, tangrufus, tweetythierry.
Fixes #50117.
See #3670. See #19998.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/formatting.php

    r48048 r48072  
    936936 *
    937937 * @since 1.2.2
     938 * @since 5.5.0 `$quote_style` also accepts `ENT_XML1`.
    938939 * @access private
    939940 *
     
    943944 * @param int|string   $quote_style   Optional. Converts double quotes if set to ENT_COMPAT,
    944945 *                                    both single and double if set to ENT_QUOTES or none if set to ENT_NOQUOTES.
    945  *                                    Also compatible with old values; converting single quotes if set to 'single',
     946 *                                    Converts single and double quotes, as well as converting HTML
     947 *                                    named entities (that are not also XML named entities) to their
     948 *                                    code points if set to ENT_XML1. Also compatible with old values;
     949 *                                    converting single quotes if set to 'single',
    946950 *                                    double if set to 'double' or both if otherwise set.
    947951 *                                    Default is ENT_NOQUOTES.
     
    965969    if ( empty( $quote_style ) ) {
    966970        $quote_style = ENT_NOQUOTES;
    967     } elseif ( ! in_array( $quote_style, array( 0, 2, 3, 'single', 'double' ), true ) ) {
     971    } elseif ( ENT_XML1 === $quote_style ) {
     972        $quote_style = ENT_QUOTES | ENT_XML1;
     973    } elseif ( ! in_array( $quote_style, array( ENT_NOQUOTES, ENT_COMPAT, ENT_QUOTES, 'single', 'double' ), true ) ) {
    968974        $quote_style = ENT_QUOTES;
    969975    }
     
    9951001        // Guarantee every &entity; is valid, convert &garbage; into &amp;garbage;
    9961002        // This is required for PHP < 5.4.0 because ENT_HTML401 flag is unavailable.
    997         $string = wp_kses_normalize_entities( $string );
     1003        $string = wp_kses_normalize_entities( $string, ( $quote_style & ENT_XML1 ) ? 'xml' : 'html' );
    9981004    }
    9991005
     
    45384544
    45394545/**
     4546 * Escaping for XML blocks.
     4547 *
     4548 * @since 5.5.0
     4549 *
     4550 * @param string $text Text to escape.
     4551 * @return string Escaped text.
     4552 */
     4553function esc_xml( $text ) {
     4554    $safe_text = wp_check_invalid_utf8( $text );
     4555
     4556    $cdata_regex = '\<\!\[CDATA\[.*?\]\]\>';
     4557    $regex       = <<<EOF
     4558/
     4559    (?=.*?{$cdata_regex})                 # lookahead that will match anything followed by a CDATA Section
     4560    (?<non_cdata_followed_by_cdata>(.*?)) # the "anything" matched by the lookahead
     4561    (?<cdata>({$cdata_regex}))            # the CDATA Section matched by the lookahead
     4562
     4563|                                         # alternative
     4564
     4565    (?<non_cdata>(.*))                    # non-CDATA Section
     4566/sx
     4567EOF;
     4568
     4569    $safe_text = (string) preg_replace_callback(
     4570        $regex,
     4571        static function( $matches ) {
     4572            if ( ! $matches[0] ) {
     4573                return '';
     4574            }
     4575
     4576            if ( ! empty( $matches['non_cdata'] ) ) {
     4577                // escape HTML entities in the non-CDATA Section.
     4578                return _wp_specialchars( $matches['non_cdata'], ENT_XML1 );
     4579            }
     4580
     4581            // Return the CDATA Section unchanged, escape HTML entities in the rest.
     4582            return _wp_specialchars( $matches['non_cdata_followed_by_cdata'], ENT_XML1 ) . $matches['cdata'];
     4583        },
     4584        $safe_text
     4585    );
     4586
     4587    /**
     4588     * Filters a string cleaned and escaped for output in XML.
     4589     *
     4590     * Text passed to esc_xml() is stripped of invalid or special characters
     4591     * before output. HTML named character references are converted to their
     4592     * equivalent code points.
     4593     *
     4594     * @since 5.5.0
     4595     *
     4596     * @param string $safe_text The text after it has been escaped.
     4597     * @param string $text      The text prior to being escaped.
     4598     */
     4599    return apply_filters( 'esc_xml', $safe_text, $text );
     4600}
     4601
     4602/**
    45404603 * Escape an HTML tag name.
    45414604 *
Note: See TracChangeset for help on using the changeset viewer.