Make WordPress Core


Ignore:
Timestamp:
08/12/2025 06:13:48 PM (9 months ago)
Author:
dmsnell
Message:

Add wp_is_valid_utf8() for normalizing UTF-8 checks.

There are several existing mechanisms in Core to determine if a given string contains valid UTF-8 bytes or not. These are spread out and depend on which extensions are installed on the running system and what is set for blog_charset. The seems_utf8() function is one of these mechanisms.

seems_utf8() does not properly validate UTF-8, unfortunately, and is slow, and the purpose of the function is veiled behind its name and historic legacy.

This patch deprecates seems_utf8() and introduces wp_is_valid_utf8(), a new, spec-compliant, efficient, and focused UTF-8 validator. This new validator defers to mb_check_encoding() where present, otherwise validating with a pure-PHP implementation. This makes the spec-compliant validator available on all systems regardless of their runtime environment.

Developed in https://github.com/WordPress/wordpress-develop/pull/9317
Discussed in https://core.trac.wordpress.org/ticket/38044

Props dmsnell, jonsurrell, jorbin.
Fixes #38044.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/tests/phpunit/tests/formatting/seemsUtf8.php

    r56536 r60630  
    <?php

    /**
     * Tests for `seems_utf8()`.
     *
     * `seems_utf8()` is deprecated in favor of `wp_is_valid_utf8()`, so every
     * test in this class declares the deprecation notice it expects.
     *
     * @group formatting
     *
     * @covers ::seems_utf8
     */
    class Tests_Formatting_SeemsUtf8 extends WP_UnitTestCase {

        /**
         * `seems_utf8` returns true for UTF-8 strings.
         *
         * @expectedDeprecated seems_utf8
         *
         * @dataProvider data_seems_utf8_returns_true_for_utf8_strings
         *
         * @param string $utf8_string One trimmed line of the UTF-8 fixture file.
         */
        public function test_seems_utf8_returns_true_for_utf8_strings( $utf8_string ) {
            $this->assertTrue( seems_utf8( $utf8_string ) );
        }

        /**
         * Data provider: one dataset per line of the UTF-8 fixture file.
         *
         * Fixture text is from http://www.i18nguy.com/unicode-example.html
         *
         * @return array[] List of single-argument datasets.
         */
        public static function data_seems_utf8_returns_true_for_utf8_strings() {
            return self::read_fixture_lines( DIR_TESTDATA . '/formatting/utf-8/utf-8.txt' );
        }

        /**
         * `seems_utf8` returns false for strings that are not UTF-8.
         *
         * @expectedDeprecated seems_utf8
         *
         * @dataProvider data_seems_utf8_returns_false_for_non_utf8_strings
         *
         * @param string $big5_string One trimmed line of the Big5 fixture file.
         */
        public function test_seems_utf8_returns_false_for_non_utf8_strings( $big5_string ) {
            $this->assertFalse( seems_utf8( $big5_string ) );
        }

        /**
         * Data provider: one dataset per line of formatting/big5.txt.
         *
         * @return array[] List of single-argument datasets.
         */
        public static function data_seems_utf8_returns_false_for_non_utf8_strings() {
            return self::read_fixture_lines( DIR_TESTDATA . '/formatting/big5.txt' );
        }

        /**
         * Reads a fixture file and wraps each trimmed line as a one-argument dataset.
         *
         * @throws RuntimeException If the fixture file cannot be read.
         *
         * @param string $path Absolute path of the fixture file.
         * @return array[] One `array( $line )` entry per line of the file.
         */
        private static function read_fixture_lines( $path ) {
            $lines = file( $path );

            // Fail with the file name instead of handing PHPUnit `false` as a dataset list.
            if ( false === $lines ) {
                throw new RuntimeException( "Unable to read test fixture: {$path}" );
            }

            $datasets = array();
            foreach ( $lines as $line ) {
                $datasets[] = array( trim( $line ) );
            }

            return $datasets;
        }
    }
Note: See TracChangeset for help on using the changeset viewer.