Make WordPress Core

Changeset 62482


Ignore:
Timestamp:
06/10/2026 03:04:54 PM (5 days ago)
Author:
dmsnell
Message:

General: Add support for unicode email addresses in is_email and sanitize_email

This adds support for the Unicode (internationalized) email address extensions defined in RFCs 6530–6533 and refactors the code so there are fewer long regexes and less duplication between sanitize_email and is_email. A new class, WP_Email_Address, provides the shared parts.

Opting out of unicode support is easy: default-filters.php enables it by adding filters, and those filters can be removed.

sanitize_email no longer makes major changes like removing an entire subdomain from someone's address; it only cleans up things like soft hyphens and whitespace — changes that happen when copying an email address from text.

Developed in: https://github.com/WordPress/wordpress-develop/pull/5237
Discussed in: https://core.trac.wordpress.org/ticket/31992

Props agulbra, akirk, benniledl, dmsnell, ironprogrammer, justlevine, mdawaffe, mukeshpanchal27, SirLouen, tusharbharti.
Fixes #31992.

Location:
trunk
Files:
3 added
9 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/default-filters.php

    r62428 r62482  
    8686    add_filter( $filter, 'sanitize_url' );
    8787    add_filter( $filter, 'wp_filter_kses' );
     88}
     89
     90// Email addresses: Allow unicode if and only if the database can
     91// store them. This affects all addresses, including those entered
     92// into contact forms.
     93if ( 'utf8mb4' === $wpdb->charset ) {
     94    add_filter( 'is_email', 'wp_is_unicode_email', 10, 3 );
     95    add_filter( 'sanitize_email', 'wp_sanitize_unicode_email', 10, 3 );
     96} else {
     97    add_filter( 'is_email', 'wp_is_ascii_email', 10, 3 );
     98    add_filter( 'sanitize_email', 'wp_sanitize_ascii_email', 10, 3 );
    8899}
    89100
  • trunk/src/wp-includes/formatting.php

    r62425 r62482  
    21772177}
    21782178
     2179
    21792180/**
    21802181 * Sanitizes a string key.
     
    35903591 * Verifies that an email is valid.
    35913592 *
    3592  * Does not grok i18n domains. Not RFC compliant.
     3593 * This accepts addresses that match the WHATWG specification,
     3594 * i.e. what browsers use for `<input type=email>`. It also accepts some
     3595 * additional addresses.
     3596 *
     3597 * By default this accepts addresses like info@grå.org (also accepted
     3598 * by Firefox's `<input type=email>`). You can disable Unicode support by
     3599 * using the wp_is_ascii_email filter instead of wp_is_unicode_email,
     3600 * which is the default.
    35933601 *
    35943602 * @since 0.71
     
    36033611    }
    36043612
    3605     // Test for the minimum length the email can be.
    3606     if ( strlen( $email ) < 6 ) {
    3607         /**
    3608          * Filters whether an email address is valid.
    3609          *
    3610          * This filter is evaluated under several different contexts, such as 'email_too_short',
    3611          * 'email_no_at', 'local_invalid_chars', 'domain_period_sequence', 'domain_period_limits',
    3612          * 'domain_no_periods', 'sub_hyphen_limits', 'sub_invalid_chars', or no specific context.
    3613          *
    3614          * @since 2.8.0
    3615          *
    3616          * @param string|false $is_email The email address if successfully passed the is_email() checks, false otherwise.
    3617          * @param string       $email    The email address being checked.
    3618          * @param string       $context  Context under which the email was tested.
    3619          */
    3620         return apply_filters( 'is_email', false, $email, 'email_too_short' );
    3621     }
    3622 
    3623     // Test for an @ character after the first position.
    3624     if ( false === strpos( $email, '@', 1 ) ) {
    3625         /** This filter is documented in wp-includes/formatting.php */
    3626         return apply_filters( 'is_email', false, $email, 'email_no_at' );
    3627     }
    3628 
    3629     // Split out the local and domain parts.
    3630     list( $local, $domain ) = explode( '@', $email, 2 );
    3631 
    3632     /*
    3633      * LOCAL PART
    3634      * Test for invalid characters.
     3613    /**
     3614     * Filters whether an email address is valid.
     3615     *
     3616     * This filter is evaluated under several different contexts, such as
     3617     * 'local_invalid_chars', 'domain_no_periods', or no specific context.
     3618     * Filters registered on this hook perform the actual validation; the
     3619     * default filter is registered in default-filters.php.
     3620     *
     3621     * @since 2.8.0
     3622     *
     3623     * @param string|false $is_email The email address if successfully passed the is_email() checks, false otherwise.
     3624     * @param string       $email    The email address being checked.
     3625     * @param string|null  $context  Context under which the email was tested, or null for the initial call.
    36353626     */
    3636     if ( ! preg_match( '/^[a-zA-Z0-9!#$%&\'*+\/=?^_`{|}~\.-]+$/', $local ) ) {
    3637         /** This filter is documented in wp-includes/formatting.php */
    3638         return apply_filters( 'is_email', false, $email, 'local_invalid_chars' );
    3639     }
    3640 
    3641     /*
    3642      * DOMAIN PART
    3643      * Test for sequences of periods.
    3644      */
    3645     if ( preg_match( '/\.{2,}/', $domain ) ) {
    3646         /** This filter is documented in wp-includes/formatting.php */
    3647         return apply_filters( 'is_email', false, $email, 'domain_period_sequence' );
    3648     }
    3649 
    3650     // Test for leading and trailing periods and whitespace.
    3651     if ( trim( $domain, " \t\n\r\0\x0B." ) !== $domain ) {
    3652         /** This filter is documented in wp-includes/formatting.php */
    3653         return apply_filters( 'is_email', false, $email, 'domain_period_limits' );
    3654     }
    3655 
    3656     // Split the domain into subs.
    3657     $subs = explode( '.', $domain );
    3658 
    3659     // Assume the domain will have at least two subs.
    3660     if ( 2 > count( $subs ) ) {
    3661         /** This filter is documented in wp-includes/formatting.php */
    3662         return apply_filters( 'is_email', false, $email, 'domain_no_periods' );
    3663     }
    3664 
    3665     // Loop through each sub.
    3666     foreach ( $subs as $sub ) {
    3667         // Test for leading and trailing hyphens and whitespace.
    3668         if ( trim( $sub, " \t\n\r\0\x0B-" ) !== $sub ) {
    3669             /** This filter is documented in wp-includes/formatting.php */
    3670             return apply_filters( 'is_email', false, $email, 'sub_hyphen_limits' );
    3671         }
    3672 
    3673         // Test for invalid characters.
    3674         if ( ! preg_match( '/^[a-z0-9-]+$/i', $sub ) ) {
    3675             /** This filter is documented in wp-includes/formatting.php */
    3676             return apply_filters( 'is_email', false, $email, 'sub_invalid_chars' );
    3677         }
    3678     }
    3679 
    3680     // Congratulations, your email made it!
    3681     /** This filter is documented in wp-includes/formatting.php */
    3682     return apply_filters( 'is_email', $email, $email, null );
     3627    return apply_filters( 'is_email', false, $email, null );
     3628}
     3629
     3630/**
     3631 * Default is_email filter for databases that support Unicode (db charset is utf8mb4).
     3632 *
     3633 * Validates the email address using {@see WP_Email_Address::from_string()} with Unicode enabled.
     3634 * Only acts when $context is null (which it is in the initial validation call); later rescue-context calls are passed through.
     3635 *
     3636 * @since 7.1.0
     3637 *
     3638 * @param string|false $value   The current filter value.
     3639 * @param string       $email   The email address being checked.
     3640 * @param string|null  $context Validation context, or null for the initial call.
     3641 * @return string|false The email address if valid, false otherwise.
     3642 */
     3643function wp_is_unicode_email( $value, $email, $context ) {
     3644    if ( null !== $context ) {
     3645        return $value;
     3646    }
     3647
     3648    $result = WP_Email_Address::from_string( $email, 'unicode' );
     3649    return $result ? $result->get_unicode_address() : false;
     3650}
     3651
     3652/**
     3653 * Default is_email filter for databases that do not support Unicode (db charset is not utf8mb4).
     3654 *
     3655 * Validates the email address using {@see WP_Email_Address::from_string()} with Unicode disabled.
     3656 * Only acts when $context is null (which it is in the initial validation call); later rescue-context calls are passed through.
     3657 *
     3658 * @since 7.1.0
     3659 *
     3660 * @param string|false $value   The current filter value.
     3661 * @param string       $email   The email address being checked.
     3662 * @param string|null  $context Validation context, or null for the initial call.
     3663 * @return string|false The email address if valid, false otherwise.
     3664 */
     3665function wp_is_ascii_email( $value, $email, $context ) {
     3666    if ( null !== $context ) {
     3667        return $value;
     3668    }
     3669
     3670    $result = WP_Email_Address::from_string( $email, 'ascii' );
     3671    return $result ? $result->get_unicode_address() : false;
    36833672}
    36843673
     
    38093798
    38103799/**
    3811  * Strips out all characters that are not allowable in an email.
     3800 * Sanitizes an email address.
     3801 *
     3802 * Strips stray whitespace from the input, then strips trailing dots from the domain.
     3803 * This is designed to recover from cut/paste mistakes without any risk of transforming
     3804 * the input into a different address than the user intended.
     3805 *
     3806 * Validation and final form are determined by the 'sanitize_email' filter; the default
     3807 * filter is registered in default-filters.php and delegates to {@see WP_Email_Address::from_string()}.
    38123808 *
    38133809 * @since 1.5.0
    3814  *
    3815  * @param string $email Email address to filter.
    3816  * @return string Filtered email address.
     3810 * @since 7.1.0 Accepts Unicode email addresses on supporting platforms.
     3811 *
     3812 * @param string $email Email address to sanitize.
     3813 * @return string The sanitized email address, or an empty string if invalid.
    38173814 */
    38183815function sanitize_email( $email ) {
    3819     // Test for the minimum length the email can be.
    3820     if ( strlen( $email ) < 6 ) {
    3821         /**
    3822          * Filters a sanitized email address.
    3823          *
    3824          * This filter is evaluated under several contexts, including 'email_too_short',
    3825          * 'email_no_at', 'local_invalid_chars', 'domain_period_sequence', 'domain_period_limits',
    3826          * 'domain_no_periods', 'domain_no_valid_subs', or no context.
    3827          *
    3828          * @since 2.8.0
    3829          *
    3830          * @param string $sanitized_email The sanitized email address.
    3831          * @param string $email           The email address, as provided to sanitize_email().
    3832          * @param string|null $message    A message to pass to the user. null if email is sanitized.
    3833          */
    3834         return apply_filters( 'sanitize_email', '', $email, 'email_too_short' );
    3835     }
    3836 
    3837     // Test for an @ character after the first position.
    3838     if ( false === strpos( $email, '@', 1 ) ) {
    3839         /** This filter is documented in wp-includes/formatting.php */
    3840         return apply_filters( 'sanitize_email', '', $email, 'email_no_at' );
    3841     }
    3842 
    3843     // Split out the local and domain parts.
    3844     list( $local, $domain ) = explode( '@', $email, 2 );
     3816    // Strip surrounding whitespace.
     3817    $email = trim( $email );
     3818
     3819    // Extract the address from "Display Name <username@domain>" format.
     3820    if ( 1 === preg_match( '/<([^>]+)>$/', $email, $matches ) ) {
     3821        $email = $matches[1];
     3822    }
    38453823
    38463824    /*
    3847      * LOCAL PART
    3848      * Test for invalid characters.
     3825     * Strip soft hyphens and whitespace adjacent to structural separators (dots and @),
     3826     * e.g. copy-paste artifacts like "info@example\u{00AD}.com" or "info@example .com".
     3827     *
     3828     * In some cases, e.g. autocorrect, some older software has been seen to add the
     3829     * space for unrecognized TLDs. This re-joins the parts for proper examination.
    38493830     */
    3850     $local = preg_replace( '/[^a-zA-Z0-9!#$%&\'*+\/=?^_`{|}~\.-]/', '', $local );
    3851     if ( '' === $local ) {
    3852         /** This filter is documented in wp-includes/formatting.php */
    3853         return apply_filters( 'sanitize_email', '', $email, 'local_invalid_chars' );
    3854     }
    3855 
    3856     /*
    3857      * DOMAIN PART
    3858      * Test for sequences of periods.
     3831    $email = preg_replace( '/[\x{00AD}\s]*([.@])[\x{00AD}\s]*/u', '$1', $email ) ?? $email;
     3832
     3833    // Strip trailing dots from the domain (e.g. if pasted from the end of a sentence).
     3834    if ( str_contains( $email, '@' ) ) {
     3835        list( $local, $domain ) = explode( '@', $email, 2 );
     3836        $domain                 = rtrim( $domain, '.' );
     3837        $email                  = $local . '@' . $domain;
     3838    }
     3839
     3840    /**
     3841     * Filters a sanitized email address.
     3842     *
     3843     * Filters registered on this hook perform the actual validation and return
     3844     * the canonical email string on success or an empty string on failure.
     3845     * The default filter is registered in default-filters.php.
     3846     *
     3847     * @since 2.8.0
     3848     *
     3849     * @param string      $sanitized_email The sanitized email address, or empty string.
     3850     * @param string      $email           The email address as provided to sanitize_email().
     3851     * @param string|null $context         Validation context, or null for the initial call.
    38593852     */
    3860     $domain = preg_replace( '/\.{2,}/', '', $domain );
    3861     if ( '' === $domain ) {
    3862         /** This filter is documented in wp-includes/formatting.php */
    3863         return apply_filters( 'sanitize_email', '', $email, 'domain_period_sequence' );
    3864     }
    3865 
    3866     // Test for leading and trailing periods and whitespace.
    3867     $domain = trim( $domain, " \t\n\r\0\x0B." );
    3868     if ( '' === $domain ) {
    3869         /** This filter is documented in wp-includes/formatting.php */
    3870         return apply_filters( 'sanitize_email', '', $email, 'domain_period_limits' );
    3871     }
    3872 
    3873     // Split the domain into subs.
    3874     $subs = explode( '.', $domain );
    3875 
    3876     // Assume the domain will have at least two subs.
    3877     if ( 2 > count( $subs ) ) {
    3878         /** This filter is documented in wp-includes/formatting.php */
    3879         return apply_filters( 'sanitize_email', '', $email, 'domain_no_periods' );
    3880     }
    3881 
    3882     // Create an array that will contain valid subs.
    3883     $new_subs = array();
    3884 
    3885     // Loop through each sub.
    3886     foreach ( $subs as $sub ) {
    3887         // Test for leading and trailing hyphens.
    3888         $sub = trim( $sub, " \t\n\r\0\x0B-" );
    3889 
    3890         // Test for invalid characters.
    3891         $sub = preg_replace( '/[^a-z0-9-]+/i', '', $sub );
    3892 
    3893         // If there's anything left, add it to the valid subs.
    3894         if ( '' !== $sub ) {
    3895             $new_subs[] = $sub;
    3896         }
    3897     }
    3898 
    3899     // If there aren't 2 or more valid subs.
    3900     if ( 2 > count( $new_subs ) ) {
    3901         /** This filter is documented in wp-includes/formatting.php */
    3902         return apply_filters( 'sanitize_email', '', $email, 'domain_no_valid_subs' );
    3903     }
    3904 
    3905     // Join valid subs into the new domain.
    3906     $domain = implode( '.', $new_subs );
    3907 
    3908     // Put the email back together.
    3909     $sanitized_email = $local . '@' . $domain;
    3910 
    3911     // Congratulations, your email made it!
    3912     /** This filter is documented in wp-includes/formatting.php */
    3913     return apply_filters( 'sanitize_email', $sanitized_email, $email, null );
     3853    return apply_filters( 'sanitize_email', '', $email, null );
     3854}
     3855
     3856/**
     3857 * Default sanitize_email filter for databases that support Unicode (db charset is utf8mb4).
     3858 *
     3859 * Returns the canonical address from {@see WP_Email_Address::from_string()} with Unicode
     3860 * enabled, or an empty string if the address is invalid.
     3861 *
     3862 * @since 7.1.0
     3863 *
     3864 * @param string      $value   The current filter value.
     3865 * @param string      $email   The email address being sanitized.
     3866 * @param string|null $context Sanitization context, always null.
     3867 * @return string The canonical email address if valid, empty string otherwise.
     3868 */
     3869function wp_sanitize_unicode_email( $value, $email, $context ) {
     3870    $result = WP_Email_Address::from_string( $email, 'unicode' );
     3871    return $result ? $result->get_unicode_address() : '';
     3872}
     3873
     3874/**
     3875 * Default sanitize_email filter for databases that do not support Unicode (db charset is not utf8mb4).
     3876 *
     3877 * Returns the canonical address from {@see WP_Email_Address::from_string()} with Unicode
     3878 * disabled, or an empty string if the address is invalid.
     3879 *
     3880 * @since 7.1.0
     3881 *
     3882 * @param string      $value   The current filter value.
     3883 * @param string      $email   The email address being sanitized.
     3884 * @param string|null $context Sanitization context, always null.
     3885 * @return string The canonical email address if valid, empty string otherwise.
     3886 */
     3887function wp_sanitize_ascii_email( $value, $email, $context ) {
     3888    $result = WP_Email_Address::from_string( $email, 'ascii' );
     3889    return $result ? $result->get_unicode_address() : '';
    39143890}
    39153891
  • trunk/src/wp-settings.php

    r62453 r62482  
    113113require ABSPATH . WPINC . '/class-wp-token-map.php';
    114114require ABSPATH . WPINC . '/utf8.php';
     115require ABSPATH . WPINC . '/class-wp-email-address.php';
    115116require ABSPATH . WPINC . '/formatting.php';
    116117require ABSPATH . WPINC . '/meta.php';
  • trunk/tests/phpunit/tests/auth.php

    r60895 r62482  
    15211521    public function test_wp_signon_using_email_with_an_apostrophe() {
    15221522        $user_args = array(
    1523             'user_email' => "mail\'@example.com",
     1523            'user_email' => "mail'@example.com",
    15241524            'user_pass'  => 'password',
    15251525        );
     
    18341834    public function test_reset_password_with_apostrophe_in_email() {
    18351835        $user_args = array(
    1836             'user_email' => "jo'hn@example.com",
     1836            'user_email' => "jo\'hn@example.com",
    18371837            'user_pass'  => 'password',
    18381838        );
  • trunk/tests/phpunit/tests/formatting/antispambot.php

    r62425 r62482  
    3535            'deep subdomain'       => array( 'kevin@many.subdomains.make.a.happy.man.edu' ),
    3636            'short address'        => array( 'a@b.co' ),
     37            'ascii@nonascii'       => array( 'info@grå.org' ),
     38            'nonascii@nonascii'    => array( 'grå@grå.org' ),
     39            'decomposed unicode'   => array( "gr\u{0061}\u{030a}blå@grå.org" ),
    3740            'weird but legal dots' => array( '..@example.com' ),
    3841            'umlauts'              => array( 'bücher@gmx.de' ),
  • trunk/tests/phpunit/tests/formatting/isEmail.php

    r62249 r62482  
    3838            'kevin@many.subdomains.make.a.happy.man.edu',
    3939            'a@b.co',
     40            'a@b.c',
    4041            'bill+ted@example.com',
     42            'info@grå.org',
     43            'grå@grå.org',
     44            "gr\u{0061}\u{030a}blå@grå.org",
    4145            '..@example.com',
    4246        );
     
    7579            'com.exampleNOSPAMbob',
    7680            'bob@your mom',
    77             'a@b.c',
    7881            '" "@b.c',
    79             '"@"@b.c',
    80             'a@route.org@b.c',
    8182            'h(aj@couc.ou', // bad comment.
    8283            'hi@',
  • trunk/tests/phpunit/tests/formatting/sanitizeEmail.php

    r62249 r62482  
    4242    public function data_sanitized_email_pairs() {
    4343        return array(
    44             'shorter than 6 characters'      => array( 'a@b', '' ),
    45             'contains no @'                  => array( 'ab', '' ),
    46             'just a TLD'                     => array( 'abc@com', '' ),
    47             'plain'                          => array( 'abc@example.com', 'abc@example.com' ),
    48             'invalid utf8 subdomain dropped' => array( "abc@sub.\x80.org", 'abc@sub.org' ),
    49             'all subdomains invalid utf8'    => array( "abc@\x80.org", '' ),
     44            'shorter than 6 characters'        => array( 'a@b', '' ),
     45            'contains no @'                    => array( 'ab', '' ),
     46            'just a TLD'                       => array( 'abc@com', '' ),
     47            'plain'                            => array( 'abc@example.com', 'abc@example.com' ),
     48            'unicode domain'                   => array( 'abc@grå.org', 'abc@grå.org' ),
     49            'unicode local part'               => array( 'grå@example.com', 'grå@example.com' ),
     50            'unicode local and domain'         => array( 'grå@grå.org', 'grå@grå.org' ),
     51            'invalid utf8 in local'            => array( "a\x80b@example.com", '' ),
     52            'invalid utf8 subdomain'           => array( "abc@sub.\x80.org", '' ),
     53            'all subdomains invalid utf8'      => array( "abc@\x80.org", '' ),
     54            'soft hyphen before dot'           => array( "info@example\xC2\xAD.com", 'info@example.com' ),
     55            'soft hyphen after dot'            => array( "info@example.\xC2\xADcom", 'info@example.com' ),
     56            'space before dot'                 => array( 'info@example .com', 'info@example.com' ),
     57            'space after dot'                  => array( 'info@example. com', 'info@example.com' ),
     58            'soft hyphen and space around dot' => array( "info@example \xC2\xAD.com", 'info@example.com' ),
     59            'space around at sign'             => array( 'info @ example.com', 'info@example.com' ),
     60            'soft hyphen before at sign'       => array( "info\xC2\xAD@example.com", 'info@example.com' ),
     61            'display name with angle brackets' => array( 'Alice Example <alice@example.com>', 'alice@example.com' ),
     62            'angle brackets only'              => array( '<alice@example.com>', 'alice@example.com' ),
     63            'angle brackets invalid address'   => array( 'Alice <not-an-email>', '' ),
    5064        );
    5165    }
  • trunk/tests/phpunit/tests/privacy/wpCreateUserRequest.php

    r55337 r62482  
    153153     */
    154154    public function test_sanitized_email() {
    155         $actual = wp_create_user_request( 'some(email<withinvalid\characters@local.test', 'export_personal_data' );
    156 
    157         $this->assertNotWPError( $actual );
    158 
    159         $post = get_post( $actual );
    160 
    161         $this->assertSame( 'export_personal_data', $post->post_name );
    162         $this->assertSame( 'someemailwithinvalidcharacters@local.test', $post->post_title );
     155        // Address supplied in "Display Name <address>" format should be extracted and accepted.
     156        $actual = wp_create_user_request( 'Some User <sanitized@local.test>', 'export_personal_data' );
     157
     158        $this->assertNotWPError( $actual );
     159
     160        $post = get_post( $actual );
     161
     162        $this->assertSame( 'export_personal_data', $post->post_name );
     163        $this->assertSame( 'sanitized@local.test', $post->post_title );
    163164    }
    164165
  • trunk/tests/phpunit/tests/rest-api/rest-comments-controller.php

    r61282 r62482  
    23222322            'post'         => self::$post_id,
    23232323            'author_name'  => 'Bleeding Gums Murphy',
    2324             'author_email' => 'murphy@' . rand_long_str( 190 ) . '.com',
     2324            'author_email' => 'murphy@' . rand_long_str( 60 ) . '.' . rand_long_str( 60 ) . '.com',
    23252325            'author_url'   => 'http://jazz.gingivitis.com',
    23262326            'content'      => 'This isn\'t a saxophone. It\'s an umbrella.',
     
    29552955
    29562956        $params = array(
    2957             'author_email' => 'murphy@' . rand_long_str( 190 ) . '.com',
     2957            'author_email' => 'murphy@' . rand_long_str( 60 ) . '.' . rand_long_str( 60 ) . '.com',
    29582958            'content'      => 'This isn\'t a saxophone. It\'s an umbrella.',
    29592959        );
Note: See TracChangeset for help on using the changeset viewer.