Make WordPress Core

Changeset 62249


Ignore:
Timestamp:
04/21/2026 02:03:56 PM (3 weeks ago)
Author:
dmsnell
Message:

Tests: Print invalid UTF-8 as ASCII to fix hosts test reporting failures.

When serializing test output into XML, invalid UTF-8 bytes cause a failure when those test results are later read back. This patch adds code that remaps invalid bytes into an ASCII-readable form: each invalid byte sequence is wrapped in parentheses, and the bytes within it are written in hex and separated by spaces.

This ensures that a proper XML file is generated from the testing results.

Developed in: https://github.com/WordPress/wordpress-develop/pull/11620
Discussed in: https://core.trac.wordpress.org/ticket/31992
Reported in: https://github.com/WordPress/phpunit-test-runner/pull/310

Follow-up to: [62225].

Props agulbra, amykamala, codexdemon, dmsnell, mywp459, rolle.
See #31992.

Location:
trunk/tests/phpunit/tests/formatting
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/tests/phpunit/tests/formatting/isEmail.php

    r62225 r62249  
    123123
    124124        foreach ( $invalid_emails as $email ) {
    125             yield $email => array( $email );
     125            yield self::invalid_utf8_as_ascii( $email ) => array( $email );
    126126        }
    127127    }
     128
     129    /**
     130     * Transforms invalid byte sequences in UTF-8 into representations of
     131     * each byte value, according to the maximal subpart rule.
     132     *
     133     * Example:
     134     *
     135     *     // For valid UTF-8 the output is the input.
     136     *     'test' === invalid_utf8_as_ascii( 'test' );
     137     *
     138     *     // Invalid bytes are represented with their hex value.
     139     *     'a(0x80)b' === invalid_utf8_as_ascii( "a\x80b" );
     140     *
     141     *     // Invalid byte sequences form maximal subparts.
     142     *     '(0xc2)(0xef 0xbf)' === invalid_utf8_as_ascii( "\xC2\xEF\xBF" );
     143     *
     144     * @param string $text
     145     * @return string
     146     */
     147    private static function invalid_utf8_as_ascii( string $text ): string {
     148        $output        = '';
     149        $at            = 0;
     150        $was_at        = 0;
     151        $end           = strlen( $text );
     152        $invalid_bytes = 0;
     153
     154        while ( $at < $end ) {
     155            if ( 0 === _wp_scan_utf8( $text, $at, $invalid_bytes ) && 0 === $invalid_bytes ) {
     156                break;
     157            }
     158
     159            if ( $at > $was_at ) {
     160                $output .= substr( $text, $was_at, $at - $was_at );
     161            }
     162
     163            if ( $invalid_bytes > 0 ) {
     164                $output .= '(';
     165
     166                for ( $i = 0; $i < $invalid_bytes; $i++ ) {
     167                    $space   = $i > 0 ? ' ' : '';
     168                    $as_hex  = bin2hex( $text[ $at + $i ] );
     169                    $output .= "{$space}0x{$as_hex}";
     170                }
     171
     172                $output .= ')';
     173            }
     174
     175            $at    += $invalid_bytes;
     176            $was_at = $at;
     177        }
     178
     179        return $output;
     180    }
    128181}
  • trunk/tests/phpunit/tests/formatting/sanitizeEmail.php

    r62226 r62249  
    1818     */
    1919    public function test_returns_stripped_email_address( $address, $expected ) {
    20         $this->assertSame(
    21             $expected,
    22             sanitize_email( $address ),
    23             'Should have produced the known sanitized form of the email.'
    24         );
     20        $sanitized = sanitize_email( $address );
     21
     22        if ( $expected === $sanitized ) {
     23            $this->assertSame(
     24                $expected,
     25                $sanitized,
     26                'Should have produced the known sanitized form of the email.'
     27            );
     28        } else {
     29            $this->assertSame(
     30                $expected,
     31                self::invalid_utf8_as_ascii( $sanitized ),
     32                'Should have produced the known sanitized form of the email.'
     33            );
     34        }
    2535    }
    2636
     
    4050        );
    4151    }
     52
     53    /**
     54     * Transforms invalid byte sequences in UTF-8 into representations of
     55     * each byte value, according to the maximal subpart rule.
     56     *
     57     * Example:
     58     *
     59     *     // For valid UTF-8 the output is the input.
     60     *     'test' === invalid_utf8_as_ascii( 'test' );
     61     *
     62     *     // Invalid bytes are represented with their hex value.
     63     *     'a(0x80)b' === invalid_utf8_as_ascii( "a\x80b" );
     64     *
     65     *     // Invalid byte sequences form maximal subparts.
     66     *     '(0xc2)(0xef 0xbf)' === invalid_utf8_as_ascii( "\xC2\xEF\xBF" );
     67     *
     68     * @param string $text
     69     * @return string
     70     */
     71    private static function invalid_utf8_as_ascii( string $text ): string {
     72        $output        = '';
     73        $at            = 0;
     74        $was_at        = 0;
     75        $end           = strlen( $text );
     76        $invalid_bytes = 0;
     77
     78        while ( $at < $end ) {
     79            if ( 0 === _wp_scan_utf8( $text, $at, $invalid_bytes ) && 0 === $invalid_bytes ) {
     80                break;
     81            }
     82
     83            if ( $at > $was_at ) {
     84                $output .= substr( $text, $was_at, $at - $was_at );
     85            }
     86
     87            if ( $invalid_bytes > 0 ) {
     88                $output .= '(';
     89
     90                for ( $i = 0; $i < $invalid_bytes; $i++ ) {
     91                    $space   = $i > 0 ? ' ' : '';
     92                    $as_hex  = bin2hex( $text[ $at + $i ] );
     93                    $output .= "{$space}0x{$as_hex}";
     94                }
     95
     96                $output .= ')';
     97            }
     98
     99            $at    += $invalid_bytes;
     100            $was_at = $at;
     101        }
     102
     103        return $output;
     104    }
    42105}
Note: See TracChangeset for help on using the changeset viewer.