Make WordPress Core

Changeset 58558


Ignore:
Timestamp:
06/25/2024 03:09:43 AM (3 weeks ago)
Author:
dmsnell
Message:

HTML API: Add missing subclass methods to HTML Processor and add token provenance.

This patch introduces two related changes:

  • It adds missing subclass methods on the HTML Processor which needed to be implemented since it started visiting virtual nodes. These methods need to account for the fact that not all tokens truly exist.
  • It adds a new concept and internal method, is_virtual(), indicating if the currently-matched token comes from the raw text in the input HTML document or if it was the byproduct of semantic parsing rules. This internal method and the new vocabulary around token provenance considerably simplify the logic spread throughout the rest of the class and its subclass methods.

Developed in https://github.com/WordPress/wordpress-develop/pull/6860
Discussed in https://core.trac.wordpress.org/ticket/61348

Follow-up to [58304].

Props dmsnell, jonsurrell, gziolo.
See #61348.

Location:
trunk/src/wp-includes/html-api
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/html-api/class-wp-html-processor.php

    r58365 r58558  
    350350        $this->state->stack_of_open_elements->set_push_handler(
    351351            function ( WP_HTML_Token $token ) {
    352                 $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH );
     352                $is_virtual            = ! isset( $this->state->current_token ) || $this->is_tag_closer();
     353                $same_node             = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name;
     354                $provenance            = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real';
     355                $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH, $provenance );
    353356            }
    354357        );
     
    356359        $this->state->stack_of_open_elements->set_pop_handler(
    357360            function ( WP_HTML_Token $token ) {
    358                 $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP );
     361                $is_virtual            = ! isset( $this->state->current_token ) || ! $this->is_tag_closer();
     362                $same_node             = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name;
     363                $provenance            = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real';
     364                $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP, $provenance );
    359365            }
    360366        );
     
    570576     */
    571577    public function is_tag_closer() {
    572         return isset( $this->current_element )
    573             ? ( WP_HTML_Stack_Event::POP === $this->current_element->operation )
     578        return $this->is_virtual()
     579            ? ( WP_HTML_Stack_Event::POP === $this->current_element->operation && '#tag' === $this->get_token_type() )
    574580            : parent::is_tag_closer();
     581    }
     582
     583    /**
     584     * Indicates if the currently-matched token is virtual, created by a stack operation
     585     * while processing HTML, rather than a token found in the HTML text itself.
     586     *
     587     * @since 6.6.0
     588     *
     589     * @return bool Whether the current token is virtual.
     590     */
     591    private function is_virtual() {
     592        return (
     593            isset( $this->current_element->provenance ) &&
     594            'virtual' === $this->current_element->provenance
     595        );
    575596    }
    576597
     
    14411462        }
    14421463
    1443         if ( isset( $this->current_element ) ) {
     1464        if ( $this->is_virtual() ) {
    14441465            return $this->current_element->token->node_name;
    14451466        }
     
    14611482
    14621483    /**
     1484     * Indicates if the currently matched tag contains the self-closing flag.
     1485     *
     1486     * No HTML elements ought to have the self-closing flag and for those, the self-closing
     1487     * flag will be ignored. For void elements this is benign because they "self close"
     1488     * automatically. For non-void HTML elements though problems will appear if someone
     1489     * intends to use a self-closing element in place of that element with an empty body.
     1490     * For HTML foreign elements and custom elements the self-closing flag determines if
     1491     * they self-close or not.
     1492     *
     1493     * This function does not determine if a tag is self-closing,
     1494     * but only if the self-closing flag is present in the syntax.
     1495     *
     1496     * @since 6.6.0 Subclassed for the HTML Processor.
     1497     *
     1498     * @return bool Whether the currently matched tag contains the self-closing flag.
     1499     */
     1500    public function has_self_closing_flag() {
     1501        return $this->is_virtual() ? false : parent::has_self_closing_flag();
     1502    }
     1503
     1504    /**
    14631505     * Returns the node name represented by the token.
    14641506     *
     
    14811523     */
    14821524    public function get_token_name() {
    1483         if ( isset( $this->current_element ) ) {
    1484             return $this->current_element->token->node_name;
    1485         }
    1486 
    1487         return parent::get_token_name();
     1525        return $this->is_virtual()
     1526            ? $this->current_element->token->node_name
     1527            : parent::get_token_name();
    14881528    }
    14891529
     
    15111551     */
    15121552    public function get_token_type() {
    1513         if ( isset( $this->current_element ) ) {
    1514             $node_name = $this->current_element->token->node_name;
    1515             if ( ctype_upper( $node_name[0] ) ) {
     1553        if ( $this->is_virtual() ) {
     1554            /*
     1555             * This logic comes from the Tag Processor.
     1556             *
     1557             * @todo It would be ideal not to repeat this here, but it's not clearly
     1558             *       better to allow passing a token name to `get_token_type()`.
     1559             */
     1560            $node_name     = $this->current_element->token->node_name;
     1561            $starting_char = $node_name[0];
     1562            if ( 'A' <= $starting_char && 'Z' >= $starting_char ) {
    15161563                return '#tag';
    15171564            }
     
    15471594     */
    15481595    public function get_attribute( $name ) {
    1549         if ( isset( $this->current_element ) ) {
    1550             // Closing tokens cannot contain attributes.
    1551             if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
    1552                 return null;
    1553             }
    1554 
    1555             $node_name = $this->current_element->token->node_name;
    1556 
    1557             // Only tags can contain attributes.
    1558             if ( 'A' > $node_name[0] || 'Z' < $node_name[0] ) {
    1559                 return null;
    1560             }
    1561 
    1562             if ( $this->current_element->token->bookmark_name === (string) $this->bookmark_counter ) {
    1563                 return parent::get_attribute( $name );
    1564             }
    1565         }
    1566 
    1567         return null;
     1596        return $this->is_virtual() ? null : parent::get_attribute( $name );
     1597    }
     1598
     1599    /**
     1600     * Updates or creates a new attribute on the currently matched tag with the passed value.
     1601     *
     1602     * For boolean attributes special handling is provided:
     1603     *  - When `true` is passed as the value, then only the attribute name is added to the tag.
     1604     *  - When `false` is passed, the attribute gets removed if it existed before.
     1605     *
     1606     * For string attributes, the value is escaped using the `esc_attr` function.
     1607     *
     1608     * @since 6.6.0 Subclassed for the HTML Processor.
     1609     *
     1610     * @param string      $name  The attribute name to target.
     1611     * @param string|bool $value The new attribute value.
     1612     * @return bool Whether an attribute value was set.
     1613     */
     1614    public function set_attribute( $name, $value ) {
     1615        return $this->is_virtual() ? false : parent::set_attribute( $name, $value );
     1616    }
     1617
     1618    /**
     1619     * Remove an attribute from the currently-matched tag.
     1620     *
     1621     * @since 6.6.0 Subclassed for HTML Processor.
     1622     *
     1623     * @param string $name The attribute name to remove.
     1624     * @return bool Whether an attribute was removed.
     1625     */
     1626    public function remove_attribute( $name ) {
     1627        return $this->is_virtual() ? false : parent::remove_attribute( $name );
    15681628    }
    15691629
     
    15951655     */
    15961656    public function get_attribute_names_with_prefix( $prefix ) {
    1597         if ( isset( $this->current_element ) ) {
    1598             if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
    1599                 return null;
    1600             }
    1601 
    1602             $mark = $this->bookmarks[ $this->current_element->token->bookmark_name ];
    1603             if ( 0 === $mark->length ) {
    1604                 return null;
    1605             }
    1606         }
    1607 
    1608         return parent::get_attribute_names_with_prefix( $prefix );
     1657        return $this->is_virtual() ? null : parent::get_attribute_names_with_prefix( $prefix );
     1658    }
     1659
     1660    /**
     1661     * Adds a new class name to the currently matched tag.
     1662     *
     1663     * @since 6.6.0 Subclassed for the HTML Processor.
     1664     *
     1665     * @param string $class_name The class name to add.
     1666     * @return bool Whether the class was set to be added.
     1667     */
     1668    public function add_class( $class_name ) {
     1669        return $this->is_virtual() ? false : parent::add_class( $class_name );
     1670    }
     1671
     1672    /**
     1673     * Removes a class name from the currently matched tag.
     1674     *
     1675     * @since 6.6.0 Subclassed for the HTML Processor.
     1676     *
     1677     * @param string $class_name The class name to remove.
     1678     * @return bool Whether the class was set to be removed.
     1679     */
     1680    public function remove_class( $class_name ) {
     1681        return $this->is_virtual() ? false : parent::remove_class( $class_name );
     1682    }
     1683
     1684    /**
     1685     * Returns if a matched tag contains the given ASCII case-insensitive class name.
     1686     *
     1687     * @since 6.6.0 Subclassed for the HTML Processor.
     1688     *
     1689     * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive.
     1690     * @return bool|null Whether the matched tag contains the given class name, or null if not matched.
     1691     */
     1692    public function has_class( $wanted_class ) {
     1693        return $this->is_virtual() ? null : parent::has_class( $wanted_class );
     1694    }
     1695
     1696    /**
     1697     * Generator for a foreach loop to step through each class name for the matched tag.
     1698     *
     1699     * This generator function is designed to be used inside a "foreach" loop.
     1700     *
     1701     * Example:
     1702     *
     1703     *     $p = WP_HTML_Processor::create_fragment( "<div class='free &lt;egg&lt;\tlang-en'>" );
     1704     *     $p->next_tag();
     1705     *     foreach ( $p->class_list() as $class_name ) {
     1706     *         echo "{$class_name} ";
     1707     *     }
     1708     *     // Outputs: "free <egg> lang-en "
     1709     *
     1710     * @since 6.6.0 Subclassed for the HTML Processor.
     1711     */
     1712    public function class_list() {
     1713        return $this->is_virtual() ? null : parent::class_list();
    16091714    }
    16101715
     
    16301735     */
    16311736    public function get_modifiable_text() {
    1632         if ( isset( $this->current_element ) ) {
    1633             if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
    1634                 return '';
    1635             }
    1636 
    1637             $mark = $this->bookmarks[ $this->current_element->token->bookmark_name ];
    1638             if ( 0 === $mark->length ) {
    1639                 return '';
    1640             }
    1641         }
    1642         return parent::get_modifiable_text();
     1737        return $this->is_virtual() ? '' : parent::get_modifiable_text();
     1738    }
     1739
     1740    /**
     1741     * Indicates what kind of comment produced the comment node.
     1742     *
     1743     * Because there are different kinds of HTML syntax which produce
     1744     * comments, the Tag Processor tracks and exposes this as a type
     1745     * for the comment. Nominally only regular HTML comments exist as
     1746     * they are commonly known, but a number of unrelated syntax errors
     1747     * also produce comments.
     1748     *
     1749     * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
     1750     * @see self::COMMENT_AS_CDATA_LOOKALIKE
     1751     * @see self::COMMENT_AS_INVALID_HTML
     1752     * @see self::COMMENT_AS_HTML_COMMENT
     1753     * @see self::COMMENT_AS_PI_NODE_LOOKALIKE
     1754     *
     1755     * @since 6.6.0 Subclassed for the HTML Processor.
     1756     *
     1757     * @return string|null
     1758     */
     1759    public function get_comment_type() {
     1760        return $this->is_virtual() ? null : parent::get_comment_type();
    16431761    }
    16441762
  • trunk/src/wp-includes/html-api/class-wp-html-stack-event.php

    r58304 r58558  
    5858
    5959    /**
     60     * Indicates if the stack element is a real or virtual node.
     61     *
     62     * @since 6.6.0
     63     *
     64     * @var string
     65     */
     66    public $provenance;
     67
     68    /**
    6069     * Constructor function.
    6170     *
    62      * @param WP_HTML_Token $token     Token associated with stack event, always an opening token.
    63      * @param string        $operation One of self::PUSH or self::POP.
     71     * @since 6.6.0
     72     *
     73     * @param WP_HTML_Token $token      Token associated with stack event, always an opening token.
     74     * @param string        $operation  One of self::PUSH or self::POP.
     75     * @param string        $provenance "virtual" or "real".
    6476     */
    65     public function __construct( $token, $operation ) {
    66         $this->token     = $token;
    67         $this->operation = $operation;
     77    public function __construct( $token, $operation, $provenance ) {
     78        $this->token      = $token;
     79        $this->operation  = $operation;
     80        $this->provenance = $provenance;
    6881    }
    6982}
Note: See TracChangeset for help on using the changeset viewer.