Make WordPress Core


Ignore:
Timestamp:
06/02/2024 03:14:35 PM (12 months ago)
Author:
dmsnell
Message:

HTML API: Add custom text decoder.

Provides a custom decoder for strings coming from HTML attributes and
markup. This custom decoder is necessary because of deficiencies in
PHP's html_entity_decode() function:

  • It isn't aware of 720 of the possible named character references in HTML, leaving many out that should be translated.
  • It isn't aware of the ambiguous ampersand rule, which allows conversion of character references in certain contexts when they are missing their closing ;.
  • It doesn't draw a distinction for the ambiguous ampersand rule when decoding attribute values instead of markup values.
  • Use of html_entity_decode() requires manually passing non-default parameter values to ensure it decodes properly.

This decoder also provides some conveniences, such as making a
single-pass and interruptible decode operation possible. This will
provide a number of opportunities to optimize detection and decoding
of things like value prefixes, and whether a value contains a given
substring.

Developed in https://github.com/WordPress/wordpress-develop/pull/6387
Discussed in https://core.trac.wordpress.org/ticket/61072

Props dmsnell, gziolo, jonsurrell, jorbin, westonruter, zieladam.
Fixes #61072.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/class-wp-token-map.php

    r58188 r58281  
    436436     * @since 6.6.0
    437437     *
    438      * @param string  $word             Determine if this word is a lookup key in the map.
    439      * @param ?string $case_sensitivity 'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
     438     * @param string $word             Determine if this word is a lookup key in the map.
     439     * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
    440440     * @return bool Whether there's an entry for the given word in the map.
    441441     */
     
    522522     *
    523523     * @param string  $text                       String in which to search for a lookup key.
    524      * @param ?int    $offset                     How many bytes into the string where the lookup key ought to start.
    525      * @param ?int    &$matched_token_byte_length Holds byte-length of found token matched, otherwise not set.
    526      * @param ?string $case_sensitivity           'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
    527      * @return string|false Mapped value of lookup key if found, otherwise `false`.
     524     * @param int     $offset                     Optional. How many bytes into the string where the lookup key ought to start. Default 0.
     525     * @param ?int    &$matched_token_byte_length Optional. Holds byte-length of found token matched, otherwise not set. Default null.
     526     * @param string  $case_sensitivity           Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
     527     * @return string|null Mapped value of lookup key if found, otherwise `null`.
    528528     */
    529529    public function read_token( $text, $offset = 0, &$matched_token_byte_length = null, $case_sensitivity = 'case-sensitive' ) {
     
    540540                return strlen( $this->small_words ) > 0
    541541                    ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity )
    542                     : false;
     542                    : null;
    543543            }
    544544
     
    565565        return strlen( $this->small_words ) > 0
    566566            ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity )
    567             : false;
     567            : null;
    568568    }
    569569
     
    573573     * @since 6.6.0.
    574574     *
    575      * @param string  $text                       String in which to search for a lookup key.
    576      * @param ?int    $offset                     How many bytes into the string where the lookup key ought to start.
    577      * @param ?int    &$matched_token_byte_length Holds byte-length of found lookup key if matched, otherwise not set.
    578      * @param ?string $case_sensitivity           'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
    579      * @return string|false Mapped value of lookup key if found, otherwise `false`.
     575     * @param string $text                       String in which to search for a lookup key.
     576     * @param int    $offset                     Optional. How many bytes into the string where the lookup key ought to start. Default 0.
     577     * @param ?int   &$matched_token_byte_length Optional. Holds byte-length of found lookup key if matched, otherwise not set. Default null.
     578     * @param string $case_sensitivity           Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
     579     * @return string|null Mapped value of lookup key if found, otherwise `null`.
    580580     */
    581581    private function read_small_token( $text, $offset, &$matched_token_byte_length, $case_sensitivity = 'case-sensitive' ) {
     
    617617        }
    618618
    619         return false;
     619        return null;
    620620    }
    621621
     
    693693     * @since 6.6.0
    694694     *
    695      * @param ?string $indent Use this string for indentation, or rely on the default horizontal tab character.
     695     * @param string $indent Optional. Use this string for indentation, or rely on the default horizontal tab character. Default "\t".
    696696     * @return string Value which can be pasted into a PHP source file for quick loading of table.
    697697     */
Note: See TracChangeset for help on using the changeset viewer.