Make WordPress Core

Changeset 58281


Ignore:
Timestamp:
06/02/2024 03:14:35 PM (6 months ago)
Author:
dmsnell
Message:

HTML API: Add custom text decoder.

Provides a custom decoder for strings coming from HTML attributes and
markup. This custom decoder is necessary because of deficiencies in
PHP's html_entity_decode() function:

  • It isn't aware of 720 of the possible named character references in HTML, leaving many out that should be translated.
  • It isn't aware of the ambiguous ampersand rule, which allows conversion of character references in certain contexts when they are missing their closing semicolon (;).
  • It doesn't draw a distinction for the ambiguous ampersand rule when decoding attribute values instead of markup values.
  • Use of html_entity_decode() requires manually passing non-default parameter values to ensure it decodes properly.

This decoder also provides some conveniences, such as making a
single-pass and interruptible decode operation possible. This will
provide a number of opportunities to optimize detection and decoding
of things like value prefixes, and whether a value contains a given
substring.

Developed in https://github.com/WordPress/wordpress-develop/pull/6387
Discussed in https://core.trac.wordpress.org/ticket/61072

Props dmsnell, gziolo, jonsurrell, jorbin, westonruter, zieladam.
Fixes #61072.

Location:
trunk
Files:
2 added
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/class-wp-token-map.php

    r58188 r58281  
    436436     * @since 6.6.0
    437437     *
    438      * @param string  $word             Determine if this word is a lookup key in the map.
    439      * @param ?string $case_sensitivity 'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
     438     * @param string $word             Determine if this word is a lookup key in the map.
     439     * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
    440440     * @return bool Whether there's an entry for the given word in the map.
    441441     */
     
    522522     *
    523523     * @param string  $text                       String in which to search for a lookup key.
    524      * @param ?int    $offset                     How many bytes into the string where the lookup key ought to start.
    525      * @param ?int    &$matched_token_byte_length Holds byte-length of found token matched, otherwise not set.
    526      * @param ?string $case_sensitivity           'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
    527      * @return string|false Mapped value of lookup key if found, otherwise `false`.
     524     * @param int     $offset                     Optional. How many bytes into the string where the lookup key ought to start. Default 0.
     525     * @param ?int    &$matched_token_byte_length Optional. Holds byte-length of found token matched, otherwise not set. Default null.
     526     * @param string  $case_sensitivity           Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
     527     * @return string|null Mapped value of lookup key if found, otherwise `null`.
    528528     */
    529529    public function read_token( $text, $offset = 0, &$matched_token_byte_length = null, $case_sensitivity = 'case-sensitive' ) {
     
    540540                return strlen( $this->small_words ) > 0
    541541                    ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity )
    542                     : false;
     542                    : null;
    543543            }
    544544
     
    565565        return strlen( $this->small_words ) > 0
    566566            ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity )
    567             : false;
     567            : null;
    568568    }
    569569
     
    573573     * @since 6.6.0.
    574574     *
    575      * @param string  $text                       String in which to search for a lookup key.
    576      * @param ?int    $offset                     How many bytes into the string where the lookup key ought to start.
    577      * @param ?int    &$matched_token_byte_length Holds byte-length of found lookup key if matched, otherwise not set.
    578      * @param ?string $case_sensitivity           'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
    579      * @return string|false Mapped value of lookup key if found, otherwise `false`.
     575     * @param string $text                       String in which to search for a lookup key.
     576     * @param int    $offset                     Optional. How many bytes into the string where the lookup key ought to start. Default 0.
     577     * @param ?int   &$matched_token_byte_length Optional. Holds byte-length of found lookup key if matched, otherwise not set. Default null.
     578     * @param string $case_sensitivity           Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
     579     * @return string|null Mapped value of lookup key if found, otherwise `null`.
    580580     */
    581581    private function read_small_token( $text, $offset, &$matched_token_byte_length, $case_sensitivity = 'case-sensitive' ) {
     
    617617        }
    618618
    619         return false;
     619        return null;
    620620    }
    621621
     
    693693     * @since 6.6.0
    694694     *
    695      * @param ?string $indent Use this string for indentation, or rely on the default horizontal tab character.
     695     * @param string $indent Optional. Use this string for indentation, or rely on the default horizontal tab character. Default "\t".
    696696     * @return string Value which can be pasted into a PHP source file for quick loading of table.
    697697     */
  • trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php

    r58233 r58281  
    1616 *    This would increase the size of the changes for some operations but leave more
    1717 *    natural-looking output HTML.
    18  *  - Properly decode HTML character references in `get_attribute()`. PHP's
    19  *    `html_entity_decode()` is wrong in a couple ways: it doesn't account for the
    20  *    no-ambiguous-ampersand rule, and it improperly handles the way semicolons may
    21  *    or may not terminate a character reference.
    2218 *
    2319 * @package WordPress
     
    25002496         */
    25012497        $enqueued_value = substr( $enqueued_text, $equals_at + 2, -1 );
    2502         return html_entity_decode( $enqueued_value );
     2498        return WP_HTML_Decoder::decode_attribute( $enqueued_value );
    25032499    }
    25042500
     
    25732569        $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length );
    25742570
    2575         return html_entity_decode( $raw_value );
     2571        return WP_HTML_Decoder::decode_attribute( $raw_value );
    25762572    }
    25772573
     
    28732869        }
    28742870
    2875         $decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
     2871        $decoded = WP_HTML_Decoder::decode_text_node( $text );
    28762872
    28772873        /*
  • trunk/src/wp-settings.php

    r58264 r58281  
    254254require ABSPATH . WPINC . '/html-api/class-wp-html-span.php';
    255255require ABSPATH . WPINC . '/html-api/class-wp-html-text-replacement.php';
     256require ABSPATH . WPINC . '/html-api/class-wp-html-decoder.php';
    256257require ABSPATH . WPINC . '/html-api/class-wp-html-tag-processor.php';
    257258require ABSPATH . WPINC . '/html-api/class-wp-html-unsupported-exception.php';
  • trunk/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php

    r58072 r58281  
    3232     */
    3333    const SKIP_TESTS = array(
    34         'adoption01/line0046'        => 'Unimplemented: Reconstruction of active formatting elements.',
    35         'adoption01/line0159'        => 'Unimplemented: Reconstruction of active formatting elements.',
    36         'adoption01/line0318'        => 'Unimplemented: Reconstruction of active formatting elements.',
    37         'entities02/line0100'        => 'Encoded characters without semicolon termination in attribute values are not handled properly',
    38         'entities02/line0114'        => 'Encoded characters without semicolon termination in attribute values are not handled properly',
    39         'entities02/line0128'        => 'Encoded characters without semicolon termination in attribute values are not handled properly',
    40         'entities02/line0142'        => 'Encoded characters without semicolon termination in attribute values are not handled properly',
    41         'entities02/line0156'        => 'Encoded characters without semicolon termination in attribute values are not handled properly',
    42         'inbody01/line0001'          => 'Bug.',
    43         'inbody01/line0014'          => 'Bug.',
    44         'inbody01/line0029'          => 'Bug.',
    45         'menuitem-element/line0012'  => 'Bug.',
    46         'plain-text-unsafe/line0001' => 'HTML entities may be mishandled.',
    47         'plain-text-unsafe/line0105' => 'Binary.',
    48         'tests1/line0342'            => "Closing P tag implicitly creates opener, which we don't visit.",
    49         'tests1/line0720'            => 'Unimplemented: Reconstruction of active formatting elements.',
    50         'tests1/line0833'            => 'Bug.',
    51         'tests15/line0001'           => 'Unimplemented: Reconstruction of active formatting elements.',
    52         'tests15/line0022'           => 'Unimplemented: Reconstruction of active formatting elements.',
    53         'tests2/line0317'            => 'HTML entities may be mishandled.',
    54         'tests2/line0408'            => 'HTML entities may be mishandled.',
    55         'tests2/line0650'            => 'Whitespace only test never enters "in body" parsing mode.',
    56         'tests20/line0497'           => "Closing P tag implicitly creates opener, which we don't visit.",
    57         'tests23/line0001'           => 'Unimplemented: Reconstruction of active formatting elements.',
    58         'tests23/line0041'           => 'Unimplemented: Reconstruction of active formatting elements.',
    59         'tests23/line0069'           => 'Unimplemented: Reconstruction of active formatting elements.',
    60         'tests23/line0101'           => 'Unimplemented: Reconstruction of active formatting elements.',
    61         'tests25/line0169'           => 'Bug.',
    62         'tests26/line0263'           => 'Bug: An active formatting element should be created for a trailing text node.',
    63         'tests7/line0354'            => 'Bug.',
    64         'tests8/line0001'            => 'Bug.',
    65         'tests8/line0020'            => 'Bug.',
    66         'tests8/line0037'            => 'Bug.',
    67         'tests8/line0052'            => 'Bug.',
    68         'webkit01/line0174'          => 'Bug.',
     34        'adoption01/line0046'       => 'Unimplemented: Reconstruction of active formatting elements.',
     35        'adoption01/line0159'       => 'Unimplemented: Reconstruction of active formatting elements.',
     36        'adoption01/line0318'       => 'Unimplemented: Reconstruction of active formatting elements.',
     37        'inbody01/line0001'         => 'Bug.',
     38        'inbody01/line0014'         => 'Bug.',
     39        'inbody01/line0029'         => 'Bug.',
     40        'menuitem-element/line0012' => 'Bug.',
     41        'tests1/line0342'           => "Closing P tag implicitly creates opener, which we don't visit.",
     42        'tests1/line0720'           => 'Unimplemented: Reconstruction of active formatting elements.',
     43        'tests1/line0833'           => 'Bug.',
     44        'tests15/line0001'          => 'Unimplemented: Reconstruction of active formatting elements.',
     45        'tests15/line0022'          => 'Unimplemented: Reconstruction of active formatting elements.',
     46        'tests2/line0650'           => 'Whitespace only test never enters "in body" parsing mode.',
     47        'tests20/line0497'          => "Closing P tag implicitly creates opener, which we don't visit.",
     48        'tests23/line0001'          => 'Unimplemented: Reconstruction of active formatting elements.',
     49        'tests23/line0041'          => 'Unimplemented: Reconstruction of active formatting elements.',
     50        'tests23/line0069'          => 'Unimplemented: Reconstruction of active formatting elements.',
     51        'tests23/line0101'          => 'Unimplemented: Reconstruction of active formatting elements.',
     52        'tests25/line0169'          => 'Bug.',
     53        'tests26/line0263'          => 'Bug: An active formatting element should be created for a trailing text node.',
     54        'tests7/line0354'           => 'Bug.',
     55        'tests8/line0001'           => 'Bug.',
     56        'tests8/line0020'           => 'Bug.',
     57        'tests8/line0037'           => 'Bug.',
     58        'tests8/line0052'           => 'Bug.',
     59        'webkit01/line0174'         => 'Bug.',
    6960    );
    7061
     
    10596        while ( false !== ( $entry = readdir( $handle ) ) ) {
    10697            if ( ! stripos( $entry, '.dat' ) ) {
    107                 continue;
    108             }
    109 
    110             if ( 'entities01.dat' === $entry || 'entities02.dat' === $entry ) {
    11198                continue;
    11299            }
  • trunk/tests/phpunit/tests/wp-token-map/wpTokenMap.php

    r58188 r58281  
    318318
    319319        $skip_bytes = 0;
    320         $this->assertFalse(
     320        $this->assertNull(
    321321            $map->read_token( $document, 0, $skip_bytes ),
    322322            "Shouldn't have found token at start of document."
Note: See TracChangeset for help on using the changeset viewer.